def calc_jaccard_sim(df_to_process, df_match, thresh=.3, padded=True):
    if padded:
        df_processed = df_to_process.join(
            df_match,
            (F.size(F.array_intersect(df_to_process.ngrams_pad, df_match.ngrams_pad))
             / F.size(F.array_union(df_to_process.ngrams_pad, df_match.ngrams_pad))) > thresh)
    else:
        df_processed = df_to_process.join(
            df_match,
            (F.size(F.array_intersect(df_to_process.ngrams, df_match.ngrams))
             / F.size(F.array_union(df_to_process.ngrams, df_match.ngrams))) > thresh)
    return df_processed
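
# A minimal usage sketch (not from the source): toy DataFrames with an `ngrams`
# column joined by calc_jaccard_sim with padded=False; the column names
# left_id/right_id are illustrative.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

df_left = spark.createDataFrame(
    [(1, ["ab", "bc", "cd"])], ["left_id", "ngrams"])
df_right = spark.createDataFrame(
    [(10, ["bc", "cd", "de"])], ["right_id", "ngrams"])

# Keeps pairs whose Jaccard similarity over the n-gram sets exceeds thresh
# (here 2/4 = 0.5 > 0.3, so the single pair survives).
calc_jaccard_sim(df_left, df_right, thresh=0.3, padded=False).show()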
def get_df_mincityear_onw_cit(df_ani):
    return (df_ani
            .filter(sort_pub_year + ' >= ' + mincityear)
            .withColumn('references_u', func.array_distinct('references'))
            .select(
                func.col('Eid').alias('CitingEid'),
                func.explode('references_u').alias('Eid'),
                func.when(func.col('source.srcid').isin(discontinued_sources),
                          func.lit(int(1)))
                    .otherwise(func.lit(int(0)))
                    .alias('isDiscontinuedCiting'),
                func.col('Au.auid').cast('array<long>').alias('CitingAuids'))
            .join(
                df_ani.select(
                    'Eid',
                    func.col('Au.auid').cast('array<long>').alias('CitedAuids')),
                ["Eid"])
            .withColumn('overLappingAuthors',
                        func.size(func.array_intersect('CitingAuids', 'CitedAuids')))
            .select(
                "CitingEid",
                "Eid",
                'isDiscontinuedCiting',
                func.expr("IF(overLappingAuthors>0,1,0)").alias('isSelfCitation'),
                func.expr("IF(overLappingAuthors>0,NULL,CitingEid)").alias('CitingEidNonSelf'))
            .groupBy('Eid')
            .agg(
                func.count('*').alias('CitationCount'),
                func.sum('isSelfCitation').alias('SelfCitationCount'),
                (func.count('*') - func.sum('isSelfCitation')).alias('CitationCountNonSelf'),
                func.collect_list('CitingEid').alias('CitingEids'),
                func.collect_list('CitingEidNonSelf').alias('CitingEidsNonSelf'),
                func.sum("isDiscontinuedCiting").alias('CitationCountFromDiscontinuedSources')))
def count_neighborhood(df_to_process):
    df_cross_join = df_to_process.crossJoin(df_pre_neighborhood)
    df_processed = df_cross_join.withColumn(
        "size", F.size(F.array_intersect("token_filtered", "to_match")))
    df_street = df_processed.filter(df_processed.size != 0)
    df_left = df_processed.filter(
        df_processed.size == 0).drop("to_match").drop("size")
    return "neighborhood", df_left, df_street.select(
        F.sum("_c1"), F.lit('neighborhood').alias("sem_type"))
def get_around_class_except():
    try:
        print(f"{str(dt.now())} Retail stores ordering abnormal cigarette categories")
        co_cust = get_valid_co_cust(spark).select("cust_id")
        co_co_line = get_co_co_line(spark, scope=[1, 1], filter="month") \
            .select("cust_id", "item_id", "qty_ord")
        plm_item = get_plm_item(spark).select("item_id", "item_name")

        # 1. Quantity of each cigarette type ordered by each retailer
        cust_item_sum = co_co_line.join(plm_item, "item_id") \
            .groupBy("cust_id", "item_name") \
            .agg(f.sum("qty_ord").alias("cust_item_sum"))

        # 2. Each retailer's top-3 cigarettes by order quantity
        win = Window.partitionBy("cust_id").orderBy(
            col("cust_item_sum").desc())
        rank3 = cust_item_sum.withColumn("rank", f.row_number().over(win)) \
            .where(col("rank") <= 3) \
            .groupBy("cust_id") \
            .agg(f.collect_list("item_name").alias("items"))

        win = Window.partitionBy("cust_id1").orderBy(
            col("one_km_item_sum").desc())

        # Retailers cust_id0 in the neighborhood of each retailer cust_id1
        around_cust = get_around_cust(spark, 1).select("cust_id1", "cust_id0")

        """
        Retailer cust_id1's neighborhood contains the retailers cust_id0.
        1. First join: find which retailers cust_id0 lie within one kilometre of each retailer cust_id1.
        2. Second join: quantities of each cigarette type ordered by those cust_id0 retailers
           (cust_id0 matched against cust_item_sum's cust_id).
        3. Aggregate by cust_id1 and item_name to get the per-type quantities within one kilometre.
        """
        # 3. Top-3 cigarettes ordered within one kilometre of each retailer
        one_km_rank3 = around_cust.join(cust_item_sum, col("cust_id0") == col("cust_id")) \
            .select("cust_id1", "item_name", "cust_item_sum") \
            .groupBy("cust_id1", "item_name") \
            .agg(f.sum("cust_item_sum").alias("one_km_item_sum")) \
            .withColumn("rank", f.row_number().over(win)) \
            .where(col("rank") <= 3) \
            .groupBy("cust_id1") \
            .agg(f.collect_list("item_name").alias("one_km_items"))

        colName = ["regulation_abno", "ciga_top3_last_month", "ciga_top3_km"]
        # 4. Take the intersection of the two top-3 lists; an empty intersection is an anomaly
        rank3.join(one_km_rank3, col("cust_id") == col("cust_id1")) \
            .where(f.size(f.array_intersect(col("items"), col("one_km_items"))) == 0) \
            .withColumn(colName[0], f.lit(1)) \
            .withColumnRenamed("items", colName[1]) \
            .withColumnRenamed("one_km_items", colName[2]) \
            .join(co_cust, "cust_id") \
            .foreachPartition(lambda x: write_hbase1(x, colName, hbase))
    except Exception as e:
        tb.print_exc()
def jaccard_index(primary_col: str, secondary_col: str, output_col: str,
                  df: DataFrame):
    """Calculate the Jaccard index (intersection over union) of two array columns"""
    return df.withColumn(
        output_col,
        F.when(
            F.col(primary_col).isNull() | F.col(secondary_col).isNull(),
            None,
        ).otherwise(
            F.size(F.array_intersect(F.col(primary_col), F.col(secondary_col)))
            / F.size(F.array_union(F.col(primary_col), F.col(secondary_col)))),
    )
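
# A minimal sketch (assumption, not from the source) showing jaccard_index on a
# toy DataFrame with two array columns; the null row exercises the None branch
# of the when().
from pyspark.sql import SparkSession, DataFrame
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(["a", "b", "c"], ["b", "c", "d"]),
     (None, ["x"])],
    ["tags_left", "tags_right"])

# First row -> 2/4 = 0.5; second row -> null because one side is null.
jaccard_index("tags_left", "tags_right", "jaccard", df).show()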
def scd_analyze(df, merge_on=None, state_col='_state', updated_col='_updated'):
    add_ids = '##add_ids'
    del_ids = '##del_ids'
    upd_ids = '##upd_ids'

    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    on = [c for c in on if c in colnames]

    s = on + [state_col, updated_col]
    cols = [x for x in df.columns if x not in s]

    a = df.filter(f'{state_col} = 0') \
          .groupby(updated_col) \
          .agg(F.collect_set(F.concat(*on)).alias(add_ids)) \
          .select(updated_col, add_ids)

    d = df.filter(f'{state_col} = 1') \
          .groupby(updated_col) \
          .agg(F.collect_set(F.concat(*on)).alias(del_ids)) \
          .select(updated_col, del_ids)

    res = a.join(d, on=updated_col, how='outer')
    res = res.select(updated_col,
                     F.coalesce(add_ids, F.array([])).alias(add_ids),
                     F.coalesce(del_ids, F.array([])).alias(del_ids))

    if cols:
        agg_funcs = [(F.countDistinct(x) - F.lit(1)).alias(x) for x in cols]
        cnt = df.groupby(*on, updated_col).agg(*agg_funcs)

        agg_names = [F.lit(x) for x in cols]
        agg_sums = [F.sum(x) for x in cols]
        cnt = cnt.groupby(updated_col).agg(
            F.map_from_arrays(F.array(*agg_names),
                              F.array(*agg_sums)).alias('changes'))

        res = res.join(cnt, on=updated_col)
    else:
        res = res.withColumn('changes', F.lit(None))

    res = res.select('*', F.array_intersect(add_ids, del_ids).alias(upd_ids))
    res = res.select(
        F.col(updated_col).alias('updated'),
        F.size(upd_ids).alias('upd'),
        F.size(F.array_except(add_ids, upd_ids)).alias('add'),
        F.size(F.array_except(del_ids, upd_ids)).alias('del'),
        'changes')

    return res.orderBy('updated')
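
# A usage sketch for scd_analyze under assumed conventions: _state 0 = row
# added, 1 = row deleted, _updated = change date, 'id' = merge key. The data
# below is made up for illustration.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

changes = spark.createDataFrame(
    [('a', 10, 0, '2021-01-01'),
     ('b', 20, 0, '2021-01-01'),
     ('a', 11, 0, '2021-01-02'),   # new version of 'a' added
     ('a', 10, 1, '2021-01-02')],  # old version of 'a' deleted -> counted as upd
    ['id', 'value', '_state', '_updated'])

# One row per update date with add / del / upd counts and a per-column change map.
scd_analyze(changes, merge_on='id').show(truncate=False)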
def count_address_street_name(df_to_process):
    udf_address_regex = F.udf(address_regex)
    df_cross_join = df_to_process.crossJoin(df_pre_street)
    df_processed = df_cross_join.withColumn(
        "size", F.size(F.array_intersect("token_filtered", "to_match")))
    df_street = df_processed.filter(df_processed.size != 0).withColumn(
        "sem_type", F.lit(udf_address_regex(df_processed._c0)))
    df_left = df_processed.filter(
        df_processed.size == 0).drop("to_match").drop("size")
    return "address_street_name", df_left, df_street.groupBy('sem_type').agg({
        '_c1': 'sum'
    }).select('sum(_c1)', 'sem_type')
def transform(self, date):
    self.df = self.df \
        .groupBy('author') \
        .agg(F.collect_set("link_id").alias('link_ids')) \
        .repartition('author')

    self.df = self.df.alias('df1') \
        .join(self.df.alias('df2')) \
        .where('df1.author < df2.author') \
        .select(F.col('df1.author').alias('author_1'),
                F.col('df2.author').alias('author_2'),
                F.size(F.array_intersect('df1.link_ids', 'df2.link_ids'))
                    .alias('weight')) \
        .where('weight > %d' % self.truncation)

    return self
def similaryBasedOnFollowers(data, minFollowers=20, debug=False):
    # We start by renaming the user column in line with the notation above.
    data = data.withColumnRenamed('follows', 'u1')

    # ==== Step 1 ====
    u1_fu1 = data.groupBy('u1').agg(F.collect_set(
        data.user).alias('fu1')).filter(F.size('fu1') >= minFollowers)
    if (debug):
        print('>> Step 1 :: u1 f(u1) <<')
        u1_fu1.show()

    # ==== Step 2 ====
    # First create a "dual" of data by renaming columns.
    # This will help the subsequent join.
    u2_fu2 = u1_fu1.withColumnRenamed('u1', 'u2').withColumnRenamed('fu1', 'fu2')
    prod = u1_fu1.crossJoin(u2_fu2).filter(u1_fu1.u1 < u2_fu2.u2)
    if (debug):
        print('>> Step 2 :: u1 f(u1) u2 f(u2) <<')
        prod.show()

    # ==== Step 3 ====
    prod2 = prod.withColumn('I', F.array_intersect(prod.fu1, prod.fu2)).withColumn(
        'U', F.array_union(prod.fu1, prod.fu2)).drop('fu1', 'fu2')
    if (debug):
        print('>> Step 3 :: u1 u2 I(u1,u2) U(u1,u2) <<')
        # prod2.orderBy('I', ascending=False).show()
        prod2.show()

    # ==== Step 4 ====
    result = prod2.withColumn('JI', F.size('I') / F.size('U')).drop('I', 'U')
    if (debug):
        print('>> Step 4 :: u1 u2 J(u1,u2) <<')
        result.show()

    return result
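
# A small end-to-end sketch (made-up data): an edge list where 'user' follows
# 'follows'. Both accounts clear the minFollowers bar and share 20 of 30
# distinct followers, giving a Jaccard index of about 0.67.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

edges = spark.createDataFrame(
    [(f"follower{i}", "alice") for i in range(25)] +
    [(f"follower{i}", "bob") for i in range(5, 30)],
    ["user", "follows"])

similaryBasedOnFollowers(edges, minFollowers=20, debug=False).show()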
def verification(self, candDF, threshold, key1, key2, keep_cols1, keep_cols2):
    """
    Input: $candDF is the output DataFrame from the 'filtering' function.
           $threshold is a float value between (0, 1]

    Output: Return a new DataFrame $resultDF that represents the ER result.
            It has five columns: id1, joinKey1, id2, joinKey2, jaccard

    Comments: There are two differences between $candDF and $resultDF
              (1) $resultDF adds a new column, called jaccard, which stores the jaccard
                  similarity between $joinKey1 and $joinKey2
              (2) $resultDF removes the rows whose jaccard similarity is smaller than $threshold
    """
    return candDF.select(
        'id1',
        'id2',
        (size(array_intersect(key1, key2))
         / size(array_union(key1, key2))).alias('jaccard'),
        # keep certain columns
        *keep_cols1,
        *keep_cols2
    ).where(col('jaccard') >= threshold)
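
# Illustrative call (assumption, not from the source): candidate pairs with
# tokenised join keys, invoking verification as a plain function with self
# unused. Only the first pair clears the 0.4 threshold
# (|{ipad, 2}| / |{ipad, 2, 64gb, apple}| = 0.5).
from pyspark.sql import SparkSession
from pyspark.sql.functions import size, array_intersect, array_union, col

spark = SparkSession.builder.getOrCreate()

candDF = spark.createDataFrame(
    [(1, ["ipad", "2", "64gb"], 2, ["apple", "ipad", "2"]),
     (3, ["galaxy", "s10"], 4, ["pixel", "4"])],
    ["id1", "joinKey1", "id2", "joinKey2"])

verification(None, candDF, 0.4, "joinKey1", "joinKey2", [], []).show()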
def array_intersection(a: Column, b: Column) -> Column:
    """Calculate the intersection of two array columns, dropping empty-string entries"""
    return F.array_remove(F.array_intersect(a, b), "")
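
# Quick illustration with toy data: empty strings are dropped from the
# intersection, leaving only the shared non-empty element.
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame([(["a", "", "b"], ["", "b", "c"])], ["x", "y"])
df.select(array_intersection(F.col("x"), F.col("y")).alias("common")).show()
# +------+
# |common|
# +------+
# |   [b]|
# +------+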
        f.size(f.array_distinct("stem.result")).alias("title_n_distinct_words"),
        f.size(f.expr("filter(pos.result, x -> x like 'V%')")).alias("title_n_verbs"),
        f.size(f.expr("filter(pos.result, x -> x like 'N%')")).alias("title_n_nouns"),
        f.size(f.expr("filter(pos.result, x -> x like 'PR%')")).alias("title_n_pronouns"),
        f.size(f.expr("filter(pos.result, x -> x like 'J%')")).alias("title_n_adjectives"),
        f.size(f.expr("filter(pos.result, x -> x like 'RB%')")).alias("title_n_adverbs"),
        f.array_distinct(f.col("stem.result")).alias("title_words")
    )\
    .withColumn("title_in_body_perc",
                f.size(f.array_intersect(f.col("title_words"),
                                         f.col("body_words"))) / f.col("title_n_distinct_words"))\
    .selectExpr("dataset_name", "post_id", "body_clean_nocode", "title",
                "body_n_sentences", "body_n_words", "body_n_distinct_words",
                "body_n_verbs", "body_n_nouns", "body_n_pronouns",
                "body_n_adjectives", "body_n_adverbs", "title_n_words",
                "title_n_distinct_words", "title_n_verbs",
readSongsDF.unpersist()

print("Insert playlists")
createPlaylistsDF = df_edit.withColumn(
    'Exp_Results', F.explode('create.playlists')).select('Exp_Results.*')
createPlaylistsDF.show(truncate=False)

# Only consider playlists not in the source playlists
print("Insert playlists Result")
createPlaylistsDF = createPlaylistsDF.join(
    readPlaylistsDF, createPlaylistsDF.id == readPlaylistsDF.id,
    'leftanti').join(readUserDF,
                     createPlaylistsDF.user_id == readUserDF.id,
                     'inner').select(
                         createPlaylistsDF.id,
                         F.array_intersect(
                             createPlaylistsDF.song_ids,
                             F.array([F.lit(x) for x in songs])).alias("song_ids"),
                         createPlaylistsDF.user_id)
readPlaylistsDF = readPlaylistsDF.union(createPlaylistsDF)
createPlaylistsDF.unpersist()
readUserDF.unpersist()
readPlaylistsDF.orderBy('id').show()

print("Delete playlists")
deletePlaylistsDF = df_edit.withColumn(
    'id', F.explode('delete.playlist_ids')).select("id")
deletePlaylistsDF.show(truncate=False)

# Only delete playlists that exist in the source playlists
print("Delete playlists Result")
def tag_info_df(spark): """ Extract features from the tags of a post Args: spark (SparkSession): used to run queries and commands Returns: DataFrame: With columns [ (post)_Id, #tags, contains_language_tag, contains_platform_tag ] """ language_list = ["abap", "abc", "actionscript", "ada", "algol", "algol 58", "algol 60", "algol w", "algol 68", "alice", "amiga e", "apex", "apl", "applescript", "argh!", "aargh!", "assembly", "assembly language", "autolisp", "autolt", "awk", "b", "bash", "basic", "ballerina", "bbc basic", "bc", "bcpl", "blitz basic", "bourne shell", "brainfuck", "c", "c++", "c#", "cfml", "cl", "classic visual basic", "clean", "clipper", "clojure", "cobol", "comal", "common lisp", "coffeescript", "crystal", "c shell", "ct", "d", "darkbasic", "dart", "decimal basic", "delphi", "delta cobol", "div games studio", "egl", "eiffel", "elixir", "elm", "emacs lisp", "erlang", "euphoria", "f#", "factor", "fenix project", "forth", "fortran", "foxpro", "gambas", "gcl", "gml", "go", "grasshopper", "groovy", "hack", "haskell", "hypertalk", "icon", "inform", "io", "ironpython", "j", "just another language", "java", "javascript", "just basic", "jscript", "julia", "korn shell", "kotlin", "labview", "ladder logic", "leet", "liberty basic", "lisp", "logo", "lua", "m4", "machine", "machine language", "malbolge", "maple", "matlab", "m-code", "mercury", "ml", "modula-2", "mondrian", "mql4", "msl", "natural", "oberon", "objective-c", "objectpal", "object pascal", "ocaml", "opencl", "openedge abl", "oz", "pascal", "pawn", "perl", "php", "piet", "pl/1", "pl/i", "pl/sql", "pl/pgsql", "postscript", "powerbasic", "powerbuilder", "powershell", "processing", "progress", "prolog", "providex", "purebasic", "python", "q#", "qbasic", "r", "raku", "rexx", "ring", "rpg", "ruby", "rust", "sas", "scala", "sed", "scheme", "scratch", "scratch jr.", "seed7", "self", "simula", "smalltalk", "smallbasic", "snobol", "solidity", "spark", "spss", "sql", "stata", "swift", "tcl", "tex", "ti-basic", "transact-sql", "t-sql", "turbobasic", "turbo c", "turbo pascal", "typescript", "ubasic", "vala", "vala/genie", "vb", "vbs", "vbscript", "verilog", "vhdl", "visual basic", "visual c", "visual foxpro", "visual objects", "vbscripts", "whitespace", "xslt", "xquery", "yaml"] language_list_col = array(*[lit(x) for x in language_list]) platform_list = ["arthur", "arx", "mos", "risc-ix", "risc-os", "amigaos", "amigaos-1.0-3.9", "amigaos-4", "amiga-unix", "amsdos", "contiki", "cp-m-2.2", "cp-m-plus", "symbos", "apple-ii", "apple-dos", "apple-pascal", "prodos", "gs-os", "gno-me", "apple-iii", "apple-sos", "apple-lisa", "apple-macintosh", "classic-mac-os", "a-ux", "copland", "mklinux", "pink", "rhapsody", "macos", "macos-server", "apple-network-server", "ibm-aix", "apple-messagepad", "newton-os", "iphone", "ios", "ipad", "ipados", "apple-watch", "watchos", "apple-tv", "tvos", "a-rose", "ipod-software", "netbsd", "domain-os", "atari-dos", "atari-tos", "atari-multitos", "xts-400", "beos", "beia", "beos-r5.1d0", "magnussoft-zeta", "unix", "unix-time-sharing-system-v6", "pwb-unix", "cb-unix", "unix-time-sharing-system-v7", "unix-system-iii", "unix-system-v", "unix-time-sharing-system-v8", "unix-time-sharing-system-v9", "unix-time-sharing-system-v10", "besys", "plan-9-from-bell-labs", "inferno", "burroughs-mcp", "chippewa-operating-system", "kronos", "nos", "scope", "puffin-os", "convergent-technologies-operating-system", "cromemco-dos", "cromix", "aos", "dg-ux", "rdos", "datapoint-2200", "datapoint", "deos", "heartos", "cp-m", "personal-cp-m", 
"cp-m-68k", "cp-m-8000", "cp-m-86", "cp-m-86-plus", "personal-cp-m-86", "mp-m", "mp-m-ii", "mp-m-86", "mp-m-8-16", "concurrent-cp-m", "concurrent-cp-m-86", "concurrent-cp-m-8-16", "concurrent-cp-m-68k", "dos", "concurrent-dos", "concurrent-pc-dos", "concurrent-dos-8-16", "concurrent-dos-286", "concurrent-dos-xm", "concurrent-dos-386", "concurrent-dos-386-mge", "concurrent-dos-68k", "flexos", "flexos-186", "flexos-286", "siemens-s5-dos-mt", "ibm-4680-os", "ibm-4690-os", "toshiba-4690-os", "flexos-386", "flexos-68k", "multiuser-dos", "cci-multiuser-dos", "datapac-multiuser-dos", "datapac-system-manager", "ims-multiuser-dos", "real-32", "real-ng", "dos-plus", "dr-dos", "palmdos", "star-trek", "novell-dos", "opendos", "batch-11-dos-11", "hp-ux", "multi-programming-executive", "nonstop", "os-8", "rsts-e", "rsx-11", "rt-11", "tops-10", "tenex", "tops-20", "digital-unix", "ultrix", "vms", "waits", "ose", "towns-os", "os-iv", "msp", "msp-ex", "real-time-multiprogramming-operating-system", "gcos", "multics", "chromium-os", "chrome-os", "container-optimized-os", "android", "glinux", "fuchsia", "integrity", "integrity-178b", "u-velosity", "vulcan-o-s", "harris-unix", "hdos", "ht-11", "hp-multi-programming-executive", "nonstop-os", "cp-6", "harmony-os", "irmx", "isis", "compatible-time-sharing-system", "gm-os-&-gm-naa-i-o", "ibsys", "ijmon", "share-operating-system", "university-of-michigan-executive-system", "os-360-and-successors", "os-360", "mft", "mft-ii", "mvt", "system-370", "os-vs1", "multiple-virtual-storage", "mvs-xa", "mvs-esa", "os-390", "phoenix-mvs", "z-os", "dos-360-and-successors", "bos-360", "tos-360", "dos-360", "dos-vs", "dos-vse", "vse-sp", "z-vse", "cp-cms", "cp-40", "cp-67", "vm-370", "vm-xa", "virtual-machine", "z-vm", "acp", "tpf", "z-tpf", "unix-like", "aix-370", "aix-esa", "opensolaris-for-system-z", "uts", "linux-on-ibm-z", "mts", "tss-360", "music-sp", "orvyl-and-wylbur", "pc-dos", "os-2", "os-2-warp", "ecomstation", "arcaos", "aix", "ibm-series-1", "edx", "rps", "cps", "serix", "ibm-1130", "dms", "ibm-1800", "tsx", "mpx", "ibm-8100", "dpcx", "dppx", "ibm-system-3", "ibm-system-34", "ibm-system-38", "cpf", "ibm-system-88", "stratus-vos", "as-400", "os-400", "i5-os", "ibm-i", "workplace-os", "k42", "dynix", "j", "george", "executive", "tme", "icl-vme", "vme-k", "remix-os", "lynxos", "microc-os-ii", "microc-os-iii", "xenix", "msx-dos", "ms-dos", "dos-v", "windows", "windows-1.0", "windows-2.0", "windows-3.0", "windows-3.1x", "windows-for-workgroups-3.1", "windows-3.2", "windows-for-workgroups-3.11", "windows-95", "windows-98", "windows-millennium-edition", "windows-nt", "windows-nt-3.1", "windows-nt-3.5", "windows-nt-3.51", "windows-nt-4.0", "windows-2000", "windows-xp", "windows-server-2003", "windows-fundamentals-for-legacy-pcs", "windows-vista", "windows-azure", "windows-home-server", "windows-server-2008", "windows-7", "windows-phone-7", "windows-server-2008-r2", "windows-home-server-2011", "windows-server-2012", "windows-8", "windows-phone-8", "windows-8.1", "windows-phone-8.1", "windows-server-2012-r2", "xbox-one-system-software", "windows-10", "windows-10-mobile", "windows-server-2016", "windows-server-2019", "windows-ce", "windows-ce-3.0", "windows-ce-5.0", "windows-ce-6.0", "windows-embedded-compact-7", "windows-embedded-compact-2013", "windows-mobile", "singularity", "midori", "xbox-360-system-software", "azure-sphere", "threadx", "altair-dos", "mobilinux", "tmx", "imos", "vrx", "es", "nextstep", "netware", "unixware", "novell-supernos", "novell-corsair", 
"novell-expose", "open-enterprise-server", "rtxc-quadros", "time-sharing-operating-system", "dspnano-rtos", "bada", "tizen", "sco-unix", "sco-open-desktop", "sco-openserver", "berkeley-timesharing-system", "pikeos", "trsdos", "color-basic", "newdos-80", "deskmate", "edos", "ti-rtos-kernel", "tron", "t-kernel", "exec-i", "exec-ii", "exec-8", "vs-9", "wps", "ois", "vxworks", "z80-rio", "zorin-os", "lisp-machines--inc.", "symbolics", "texas-instruments", "xerox", "andos", "csi-dos", "mk-dos", "pilot", "perq", "elbrus", "eos", "elxsi", "mai-basic-four", "michigan-terminal-system", "es-evm", "pc-mos-386", "buran", "sintran-iii", "skyos", "soda", "theos", "tsx-32", "dx10", "aegis", "coherent", "dc-osx", "dnix", "helios", "interactive-unix", "irix", "meikos", "os-9", "os-9000", "osf-1", "openstep", "qnx", "rmx", "sinix", "solaris", "sunos", "super-ux", "system-v", "system-v-at--386", "trusted-solaris", "uniflex", "unicos", "zenix", "minix", "bsd", "freebsd", "dragonflybsd", "midnightbsd", "ghostbsd", "trueos", "openbsd", "bitrig", "darwin", "gnu", "linux", "redox", "android-x86", "cray-linux-environment", "opensolaris", "illumos", "openindiana", "nexenta-os", "smartos", "rtems", "haiku", "syllable-desktop", "vsta", "plurix", "tunis", "dahliaos", "cosmos", "freedos", "genode", "ghost-os", "its", "osfree", "osv", "phantom-os", "reactos", "sharpos", "templeos", "visopsys", "research-unix", "amoeba", "croquet", "eros", "capros", "harmony", "helenos", "house", "ilios", "l4", "mach", "nemesis", "spring", "the-multiprogramming-system", "thoth", "v", "verve", "xinu", "86-dos", "dr-dos-startrek", "dr-dos-winbolt", "pts-dos", "turbodos", "desqview", "x-windowing", "banyan-vines", "cambridge-ring", "cisco-ios", "cisco-nx-os", "ctos", "data-ontap", "extremeware", "extremexos", "fabric-os", "junos", "network-operating-system", "novell-open-enterprise-server", "plan-9", "blis-cobol", "bluebottle", "bs1000", "bs2000", "bs3000", "flex9", "gem", "geos", "javaos", "jnode", "jx", "kernal", "merlin", "morphos", "fujitsu", "oberon-(operating-system)", "osd-xc", "pick", "primos", "sinclair-qdos", "ssb-dos", "symobi", "tripos", "ucsd-p-system", "vos", "vos3", "vm2000", "visi-on", "vps-vm", "aros", "atheos", "baremetal", "dexos", "emutos", "lse-os", "menuetos", "kolibrios", "toaruos", "ponyos", "serenityos", "dip-dos", "embedded-linux", "replicant", "lineageos", "list-of-custom-android-distributions", "firefox-os", "angstrom-distribution", "familiar-linux", "maemo", "openzaurus", "webos", "access-linux-platform", "openmoko-linux", "ophone", "meego", "moblin", "motomagx", "qt-extended", "sailfish-os", "ubuntu-touch", "postmarketos", "magic-cap", "palm-os", "pen-geos", "penpoint-os", "pvos", "symbian-os", "epoc", "pocket-pc", "windows-phone", "ipodlinux", "iriver-clix", "rockbox", "blackberry-os", "symbian-platform", "blackberry-10", "catos", "ios-xr", "lancom-systems", "openwrt", "dd-wrt", "lede", "gargoyle", "librecmc", "zeroshell", "rtos", "m0n0wall", "opnsense", "pfsense", "apache-mynewt", "chibios-rt", "erika-enterprise", "ecos", "nucleus-rtos", "nuttx", "ncos", "freertos--openrtos-and-safertos", "openembedded", "psos", "rex-os", "riot", "rom-dos", "tinyos", "rt-thread", "windows-iot", "windows-embedded", "wombat-os", "zephyr", "brickos", "lejos", "cambridge-cap-computer", "flex-machine", "hydra", "keykos"] # generated from util/platform_list.rb platform_list_col = array(*[lit(x) for x in platform_list]) df = spark.read.parquet("/user/***REMOVED***/StackOverflow/PostHistory.parquet") \ .select(["_PostId", "_Text", 
                 '_PostHistoryTypeId']) \
        .filter(col("_PostHistoryTypeId") == 3) \
        .withColumn("_Tags", expr("substring(_Text, 2, length(_Text) - 2)")) \
        .withColumn("_Tags", split(col("_Tags"), "><")) \
        .withColumn("#tags", when(size("_Tags") < 0, 0).otherwise(size("_Tags"))) \
        .withColumn("contains_language_tag", size(array_intersect("_Tags", language_list_col)) > 0) \
        .withColumn("contains_platform_tag", size(array_intersect("_Tags", platform_list_col)) > 0) \
        .drop("_Tags", "_PostHistoryTypeId", "_Text") \
        .withColumnRenamed('_PostId', '_Id')

    return df
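
# A self-contained sketch of the same tag-membership pattern used above
# (assumed toy data, not the StackOverflow parquet): a post "contains a
# language tag" when its tag array overlaps a literal reference array.
from pyspark.sql import SparkSession
from pyspark.sql.functions import array, array_intersect, lit, size

spark = SparkSession.builder.getOrCreate()

ref_tags = array(*[lit(x) for x in ["python", "java", "c++"]])

posts = spark.createDataFrame(
    [(1, ["python", "pandas"]), (2, ["cooking", "baking"])],
    ["post_id", "tags"])

posts.withColumn("contains_language_tag",
                 size(array_intersect("tags", ref_tags)) > 0).show()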
def get_tokens_match_over_diff(df_to_process):
    df_processed = df_to_process.withColumn(
        "score",
        F.size(F.array_intersect("token_filtered", "to_match"))
        / F.size("token_filtered"))
    return df_processed
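
# Minimal example (made-up tokens): the score is the fraction of
# token_filtered entries that also appear in to_match (here 2/4 = 0.5).
from pyspark.sql import SparkSession
import pyspark.sql.functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [(["main", "st", "apt", "4"], ["main", "st"])],
    ["token_filtered", "to_match"])

get_tokens_match_over_diff(df).show()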
df_mincityear_onw_cit = (
    df_ani
    .filter(sort_pub_year + ' >= ' + mincityear)
    .select(
        func.col('Eid').alias('CitingEid'),
        func.explode('citations').alias('Eid'),
        func.col('Au.auid').alias('CitingAuids')
    )
    .distinct()
    .join(
        df_ani.select(
            'Eid',
            func.col('Au.auid').alias('CitedAuids')
        ), ["Eid"]
    )
    .withColumn('overLappingAuthors',
                func.size(func.array_intersect('CitingAuids', 'CitedAuids')))
    .select(
        "CitingEid",
        "Eid",
        func.expr("IF(overLappingAuthors>0,1,0)").alias('isSelfCitation'),
        func.expr("IF(overLappingAuthors>0,NULL,CitingEid)").alias('CitingEidNonSelf'),
    )
    .groupBy('Eid')
    .agg(
        func.count('*').alias('CitationCount'),
        func.sum('isSelfCitation').alias('SelfCitationCount'),
        (func.count('*') - func.sum('isSelfCitation')).alias('CitationCountNonSelf'),
        func.collect_list('CitingEid').alias('CitingEids'),
        func.collect_list('CitingEidNonSelf').alias('CitingEidsNonSelf'),
    )
)
def insertDataToGraph(self):
    spark = self.sparkSession
    neo4j = self.neo4jDriver.session()
    sc = spark.sparkContext
    feats = self.user_features_df
    list_cat = self.list_categories
    cat_count = len(list_cat)

    # import edges
    e = self.edges_df
    self.nodes_df = e.select("Source").union(
        e.select("Target")).distinct().withColumnRenamed('Source', 'id')
    n = self.nodes_df
    print(feats.count(), list_cat, e.count(), n.count())
    feats.printSchema()

    # cache dataframes
    feats.cache()
    e.cache()
    n.cache()

    # add category property to u
    u_focus_rate = feats.select(
        col('id'),
        col("user_features{}.dict_focus_rate".format(
            ("_" + self.method_name) if len(self.method_name) > 0 else "")
            ).alias("dict_focus_rate"))
    u_with_category = u_focus_rate.withColumn(
        "userCategory", array([lit(c) for c in list_cat]))
    for cat in list_cat:
        u_with_category = u_with_category.withColumn(
            "temp",
            when(
                col("dict_focus_rate.{}".format(cat)) < 1 / cat_count,
                array_remove(u_with_category["userCategory"], cat)
            ).otherwise(u_with_category["userCategory"])
        ).drop("userCategory").withColumnRenamed("temp", "userCategory")
    u_with_category = u_with_category.select("id", "userCategory")

    # join n and u_with_category
    n_with_category = n.join(u_with_category, "id", how="left")

    # add category columns to e
    e_with_category = e.join(
        n_with_category, e.Source == n_with_category.id,
        how="left").withColumnRenamed(
            "userCategory", "sourceCategory").select(
                "Source", "Target", "sourceCategory")
    e_with_category = e_with_category.join(
        n_with_category, e_with_category.Target == n_with_category.id,
        how="left").withColumnRenamed(
            "userCategory", "targetCategory").select(
                "Source", "Target", "sourceCategory", "targetCategory")

    # determine intersection between sourceCategory and targetCategory
    e_with_category = e_with_category.withColumn(
        "Categories",
        array_intersect(e_with_category["sourceCategory"],
                        e_with_category["targetCategory"]))

    # flatten out categories of edges
    e_with_category = e_with_category.withColumn(
        "Category", explode(col("Categories"))).select("Source", "Target", "Category")
    print("e_with_category", e_with_category.count())
    e_with_category.printSchema()

    ## Insert data
    insert_query = '''
    UNWIND {triples} as triple
    MERGE (p1:User {id:triple[0]})
    MERGE (p2:User {id:triple[1]})
    WITH p1,p2,triple
    CALL apoc.create.relationship(p1, triple[2], {}, p2) YIELD rel
    RETURN *
    '''

    e_listoftriples = e_with_category.toPandas()[[
        'Source', 'Target', 'Category'
    ]].values.tolist()
    print("e_listoftriples:", len(e_listoftriples))

    batches = list(self.generate_batches(e_listoftriples, 7000))
    for batch in batches:
        neo4j.run(insert_query, parameters={"triples": batch})

    e_with_category.show()
    print("batches size:", len(batches), " last batch:", len(batches[-1]))