def levenshtein_matrix(df, col_name):
    """
    Create a pair of columns with every combination of the fingerprinted strings
    :param df:
    :param col_name:
    :return:
    """
    df = keycollision.fingerprint(df, col_name)

    col_fingerprint = col_name + "_FINGERPRINT"
    col_distance = col_name + "_LEVENSHTEIN_DISTANCE"

    temp_col_1 = col_name + "_LEVENSHTEIN_1"
    temp_col_2 = col_name + "_LEVENSHTEIN_2"

    # Prepare the columns needed for the cross join
    df = df.select(col_fingerprint).distinct().select(
        F.col(col_fingerprint).alias(temp_col_1),
        F.col(col_fingerprint).alias(temp_col_2))

    # Create every combination of strings and calculate the Levenshtein distance between them
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(col_distance, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    return df
def levenshtein_matrix(df, input_col):
    """
    Create a pair of columns with every combination of the fingerprinted strings
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)

    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)

    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns needed for the cross join
    df = df.select(
        F.col(fingerprint_col).alias(temp_col_1),
        F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create every combination of strings and calculate the Levenshtein distance between them
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    if Optimus.cache:
        df = df.cache()

    return df
def leven_helper(df, ref_df, cut_off, type_str):
    df_columns = df.columns

    # Grab the non-typed entries in the input df
    new_df = df.filter(df["true_type"].isNull())

    # Cross join the untyped values with the reference column
    ref_columns = ref_df.columns
    levy_df = new_df.crossJoin(ref_df)

    # Compute the Levenshtein distance, ignoring case
    levy_df = levy_df.withColumn(
        "word1_word2_levenshtein",
        levenshtein(lower(col(df_columns[0])), lower(col(ref_columns[0]))))

    # Keep the rows whose distance is within the cutoff
    levy_df = levy_df.filter(levy_df["word1_word2_levenshtein"] <= cut_off)
    levy_columns = levy_df.columns
    levy_df = levy_df.groupBy(levy_columns[0]).min("word1_word2_levenshtein")
    levy_columns = levy_df.columns
    levy_df = levy_df.select(col(levy_columns[0]),
                             col(levy_columns[1]).alias("min"))
    levy_columns = levy_df.columns
    levy_df = levy_df.drop("min")

    # Tag the values that matched the reference set with the given type
    levy_df = levy_df.collect()
    levy_df = [x[0] for x in levy_df]
    rdf = df.withColumn(
        "true_type",
        when(df[df_columns[0]].isin(levy_df), type_str).otherwise(df["true_type"]))
    return rdf
def count_city(df_to_process):
    df_processed = df_to_process.join(
        df_pre_city,
        F.levenshtein(F.lower(df_to_process._c0), F.lower(df_pre_city._c0)) < 3)
    df_left = df_to_process.filter(F.col("id").isin(df_processed["id"]))
    return 'city', df_left, df_processed.select(
        F.sum("_c1"), F.lit('city').alias("sem_type"))
def get_neighbors_notes(song, featureDF):
    comparator_value = song[0]["notes"]
    df_merged = featureDF.withColumn("compare", lit(comparator_value))
    df_levenshtein = df_merged.withColumn(
        "distances_levenshtein", levenshtein(col("notes"), col("compare")))
    result = df_levenshtein.select("id", "key", "scale", "distances_levenshtein")
    return result
def count_car_make(df_to_process):
    df_processed = df_to_process.join(
        df_pre_car_make,
        F.levenshtein(F.lower(df_to_process._c0), F.lower(df_pre_car_make._c0)) < 3)
    # df_processed = calc_jaccard_sim(df_to_process, df_pre_car_make)
    df_left = df_to_process.filter(F.col("id").isin(df_processed["id"]))
    return 'car_make', df_left, df_processed.select(
        F.sum("_c1"), F.lit('car_make').alias("sem_type"))
def count_city_agency_abbrev(df_to_process):
    df_processed = df_to_process.join(
        df_pre_city_agency_abbrev,
        F.levenshtein(F.lower(df_to_process._c0),
                      F.lower(df_pre_city_agency_abbrev._c0)) < 1)
    # df_processed = calc_jaccard_sim(df_left, df_pre_city_agency_abbrev)
    df_left = df_to_process.filter(F.col("id").isin(df_processed["id"]))
    return 'city_agency', df_left, df_processed.select(
        F.sum("_c1"), F.lit('city_agency').alias("sem_type"))
def count_school_level(df_to_process):
    df_processed = df_to_process.join(
        df_pre_school_level,
        F.levenshtein(F.lower(df_to_process._c0), F.lower(df_pre_school_level._c0)) < 3)
    # df_processed = calc_jaccard_sim(df_to_process, df_pre_school_level)
    df_left = df_to_process.filter(F.col("id").isin(df_processed["id"]))
    return 'school_level', df_left, df_processed.select(
        F.sum("_c1"), F.lit('school_level').alias("sem_type"))
def spark_ratio(left, right):
    # TODO: sparkify this function
    # Build a one-row dataframe from the two strings
    # (assumes a SparkSession named `spark` is in scope)
    df = spark.createDataFrame([(left, right)], ['left', 'right'])
    df = df.withColumn('len', F.least(F.length('left'), F.length('right')))
    df = df.withColumn('levenshtein', F.levenshtein('left', 'right'))
    df = df.withColumn('inv_edit_distance', F.col('len') - F.col('levenshtein'))
    df = df.withColumn('ratio', F.col('inv_edit_distance') / F.col('len'))
    df = df.withColumnRenamed('ratio', 'fuzzy')
    df = df.select(['fuzzy'])
    return df
def fuzzyspark(df, on, value):
    q_val = value
    df = df.select([on])
    # TODO: Rework that part
    df = df.withColumn('query', F.lit(q_val).cast(T.StringType()))
    df = df.withColumn(
        'len', F.least(F.length(on), F.lit(len(q_val)).cast(T.IntegerType())))
    df = df.withColumn('levenshtein', F.levenshtein(on, 'query'))
    df = df.withColumn('score', F.col('levenshtein') / F.col('len'))
    df = df.select(['score'])
    return df
def load_audio_id_text_id_mapping(spark, input_catalogue_path: str):
    audio_df, text_df = load_audio_and_text_dfs(spark, input_catalogue_path)
    joined_df = audio_df.join(text_df, "identifier")
    joined_df = joined_df.withColumn(
        "levenshtein",
        F.levenshtein(joined_df.audio_document_id, joined_df.text_document_id),
    )
    audio_to_text_mapping_df = joined_df.groupBy("identifier").applyInPandas(
        fuzzy_matching, schema=FUZZY_MATCHING_RETURN_TYPE
    )
    return audio_to_text_mapping_df
def assign_alternative_match_word_based_on_lavenshtein(
        self, not_matched_df, df_vector_filler):
    not_matched_df_x_filler = not_matched_df.crossJoin(
        df_vector_filler.select(col(self.word_col_name).alias('match')))
    df1_x_df2 = not_matched_df_x_filler.withColumn(
        "levenshtein", levenshtein(col('word'), col('match')))

    # For every word keep only the first match (alphabetically) among those
    # with the minimum Levenshtein distance
    return df1_x_df2.withColumn(
        'overall_min',
        min(col("levenshtein")).over(
            Window.partitionBy(self.sentence_col_id, 'word'))) \
        .where(col('overall_min') == col('levenshtein')) \
        .withColumn('rank', row_number().over(
            Window.partitionBy(self.sentence_col_id, 'word').orderBy('match'))) \
        .where(col('rank') == 1) \
        .drop('levenshtein', 'overall_min', 'rank')
def anonimization(dataframe, marca):
    marca_control = marca.lower().replace(" ", "")

    stringDistanceDf = dataframe. \
        withColumn("marca_limpia", regexp_replace(lower(col("marca")), " ", "")). \
        withColumn("control_str", lit(marca_control)). \
        withColumn("string_distance", levenshtein(col("marca_limpia"), col("control_str")))

    new_column_2 = when(col("string_distance") <= 7,
                        lit("marca")).otherwise(lit("desconocido"))

    finalDf = stringDistanceDf. \
        withColumn("marca_anom", new_column_2). \
        drop("marca", "marca_limpia", "control_str", "string_distance")

    return finalDf
def user_ratings_match(df_movies, dfUserRatings):
    # Find the closest match to each user-input movie title based on Levenshtein distance
    myRatings = dfUserRatings.join(df_movies).select(
        '*',
        levenshtein(dfUserRatings.User_Title, df_movies.Movies_Title).alias('distance')).cache()
    myRatings_best_title_match = myRatings.groupBy('User_Title') \
        .agg({'distance': 'min'}) \
        .withColumnRenamed('min(distance)', 'min_dis')
    join_condition = [myRatings.User_Title == myRatings_best_title_match.User_Title,
                      myRatings.distance == myRatings_best_title_match.min_dis]
    myRatings_movie_id = myRatings_best_title_match.join(myRatings, join_condition) \
        .select('movie_id', 'User_Ratings') \
        .withColumnRenamed('User_Ratings', 'Rating')
    # Subtracting the rating from itself yields 0, the id used for the new user
    myRatings_user_id = myRatings_movie_id.withColumn(
        'user_id', myRatings_movie_id.Rating - myRatings_movie_id.Rating)
    return myRatings_user_id
def best_choice(dict, i, PG, seedArray, genome, sc):
    SC = []
    for z in range(len(PG)):
        for pos_gen in PG[z]:
            seq = (dict[i],
                   genome[pos_gen - seedArray[z]: pos_gen - seedArray[z] + len(dict[i])],
                   seedArray[z],
                   pos_gen)
            SC.append(seq)

    rddSeq = sc.parallelize(SC)
    schemaSeqDF = rddSeq.map(lambda x: Row(SEQ=x[0], GEN=x[1], POS_SEQ=x[2], POS_GEN=x[3]))
    df = sqlContext.createDataFrame(schemaSeqDF)

    df = df.withColumn("dist", F.levenshtein(F.col("SEQ"), F.col("GEN")))
    val = (1 / float(len(dict[i]))) * 100
    df = df.withColumn("percentage", val * F.col("dist")).drop("dist")

    # Keep only the rows with the minimum percentage
    minDF = df.agg(F.min(F.col("percentage")).alias("percentage"))
    min_percentage = [x["percentage"] for x in minDF.rdd.collect()]
    df = df.filter(df.percentage == min_percentage[0])
    return df, min_percentage
def get_neighbors_notes(song):
    df = spark.createDataFrame(notes, ["id", "key", "scale", "notes"])
    filterDF = df.filter(df.id == song)
    comparator_value = filterDF.collect()[0][3]
    df_merged = df.withColumn("compare", lit(comparator_value))
    df_levenshtein = df_merged.withColumn(
        "distances_levenshtein", levenshtein(col("notes"), col("compare")))
    result = df_levenshtein.select("id", "key", "scale", "distances_levenshtein")
    aggregated = result.agg(F.min(result.distances_levenshtein),
                            F.max(result.distances_levenshtein))
    max_val = aggregated.collect()[0]["max(distances_levenshtein)"]
    min_val = aggregated.collect()[0]["min(distances_levenshtein)"]
    # Min-max scale the distances to [0, 1]
    return result.withColumn(
        'scaled_levenshtein',
        (result.distances_levenshtein - min_val) / (max_val - min_val)).select(
            "id", "key", "scale", "scaled_levenshtein")
def LEVEN(df):
    print("Computing Levenshtein for:", colName)
    types = {}
    df_columns = df.columns

    # (type name, reference dataframe, reference column, distance cutoff)
    references = [
        ('cities', cities_df, 'cities', 2),
        ('neighborhood', neighborhood_df, 'neighborhood', 2),
        ('borough', borough_df, 'borough', 2),
        ('schoolname', schoolname_df, 'schoolname', 2),
        ('color', color_df, 'color', 2),
        ('carmake', carmake_df, 'carmake', 2),
        ('cityagency', cityagency_df, 'cityagency', 2),
        ('areastudy', areastudy_df, 'areastudy', 2),
        ('subjects', subjects_df, 'subjects', 2),
        ('schoollevels', schoollevels_df, 'schoollevels', 2),
        ('college', college_df, 'college', 2),
        ('vehicletype', vehicletype_df, 'vehicletype', 2),
        ('typelocation', typelocation_df, 'typelocation', 2),
        ('parks', parks_df, 'parks', 2),
        ('building_code', building_code_df, 'building_codes', 1),
    ]

    for type_name, ref_df, ref_col, cutoff in references:
        crossjoin = df.crossJoin(ref_df)
        levy = crossjoin.withColumn(
            "word1_word2_levenshtein",
            levenshtein(col(df_columns[0]), col(ref_col)))
        matches = levy.filter(levy["word1_word2_levenshtein"] <= cutoff)
        if len(matches.take(1)) > 0:
            # will this indexing cause issues if first column is integer schema?
            frequency = matches.groupBy().sum().collect()[0][0]
            types[type_name] = frequency

    return types
# joining steps:
#  1 join both lv and yelp df
#  2 find leven. distance
#  3 find min leven distance group by lv id
#  4 join step 3 with step 2 based on leven distance
#  5 remove lv_ids where count is more than 1, since there is a tie in step 3
#  6 save result as parquet with lv_id and yelp_id

# step 1
combined_data = lv_df.join(yelp_df)

# step 2
combined_data = combined_data.select(
    "yelp_id",
    combined_data.id.alias("lv_id"),
    levenshtein("lv_full_form", "yelp_full_form").alias("leven1"))

# step 3
min_leven = combined_data.groupby("lv_id").min("leven1") \
    .select(col("lv_id").alias("lv_id2"),
            col("min(leven1)").alias("m_leven"))

# step 4
combined_data = combined_data.select(combined_data.lv_id,
                                     combined_data.yelp_id,
                                     combined_data.leven1.alias("lev"))
combined2 = min_leven.join(combined_data,
                           [min_leven.lv_id2 == combined_data.lv_id,
                            min_leven.m_leven == combined_data.lev])

# step 5
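# A minimal sketch of steps 5 and 6, which the snippet above stops short of.
# The tie-breaking rule follows the plan in the comments (keep only lv_ids
# with a single best match); the output path is made up for illustration.
tie_counts = combined2.groupby("lv_id").count()
unique_matches = tie_counts.filter(col("count") == 1).select("lv_id")
result = combined2.join(unique_matches, "lv_id").select("lv_id", "yelp_id")
result.write.parquet("lv_yelp_matches.parquet")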
def levenshtein_json(df, input_col):
    """
    Output the Levenshtein distance clusters in JSON format
    :param df: Spark Dataframe
    :param input_col:
    :return:
    """
    df = keycollision.fingerprint(df, input_col)

    fingerprint_col = name_col(input_col, FINGERPRINT_COL)
    distance_col_name = name_col(input_col, LEVENSHTEIN_DISTANCE)

    temp_col_1 = input_col + "_LEVENSHTEIN_1"
    temp_col_2 = input_col + "_LEVENSHTEIN_2"

    # Prepare the columns needed for the cross join
    result = df.select(input_col,
                       F.col(fingerprint_col).alias(temp_col_1)).distinct()

    df = df.select(input_col,
                   F.col(fingerprint_col).alias(temp_col_1),
                   F.col(fingerprint_col).alias(temp_col_2)).distinct()

    # Create every combination of strings and calculate the Levenshtein distance between them
    df = df.select(temp_col_1).crossJoin(df.select(temp_col_2)) \
        .withColumn(distance_col_name, F.levenshtein(F.col(temp_col_1), F.col(temp_col_2)))

    # Select only the strings with the shortest distance
    distance_col = name_col(input_col, LEVENSHTEIN_DISTANCE)
    distance_r_col = input_col + "_LEVENSHTEIN_DISTANCE_R"

    temp_r = "TEMP_R"
    df_r = (df.rows.drop(F.col(distance_col) == 0)
            .groupby(temp_col_1)
            .agg(F.min(distance_col).alias(distance_r_col))
            .cols.rename(temp_col_1, temp_r)).repartition(1)

    df = df.join(df_r, ((df_r[temp_r] == df[temp_col_1]) &
                        (df_r[distance_r_col] == df[distance_col]))) \
        .select(temp_col_1, distance_col, temp_col_2).repartition(1)

    # Create the clusters/lists
    df = (df.groupby(temp_col_1).agg(F.collect_list(temp_col_2)))

    # Map the fingerprints back to the original strings and build the result dict
    kv_dict = {}
    for row in result.collect():
        _row = list(row.asDict().values())
        kv_dict[_row[1]] = _row[0]

    kv_result_df = {}
    for row in df.collect():
        _row = list(row.asDict().values())
        kv_result_df[_row[0]] = _row[1]

    result = {}
    for k, v in kv_result_df.items():
        a = result[kv_dict[k]] = []
        for iv in v:
            a.append(kv_dict[iv])

    return result
def main():
    inputs = sys.argv[1]
    rating_file = sys.argv[2]
    output = sys.argv[3]

    conf = SparkConf().setAppName('movie recommendation')
    sc = SparkContext(conf=conf)
    assert sc.version >= '1.5.1'
    sqlContext = SQLContext(sc)

    """sbaronia - getting files from the directory and reading from them,
    using parse_rating_movie and parse_my_input to parse the content of
    the files into RDDs"""
    movies_path = join(inputs, "movies.dat")
    ratings_path = join(inputs, "ratings.dat")

    read_ratings = sc.textFile(ratings_path)
    read_movies = sc.textFile(movies_path)
    read_mymovies = sc.textFile(rating_file)

    parse_ratings = read_ratings.map(lambda line: parse_rating_movie(line, "ratings.dat")).cache()
    parse_movies = read_movies.map(lambda line: parse_rating_movie(line, "movies.dat")).cache()
    parse_mymovies = read_mymovies.map(lambda line: parse_my_input(line)).cache()

    """sbaronia - converting movie and rating data to dataframes"""
    schema_movie = StructType([StructField('movie_id', IntegerType(), True),
                               StructField('movie_name', StringType(), True)])
    movie_df = sqlContext.createDataFrame(parse_movies, schema=schema_movie).cache()

    schema_mymovie = StructType([StructField('ip_uid', IntegerType(), True),
                                 StructField('ip_mname', StringType(), True),
                                 StructField('ip_rating', IntegerType(), True),
                                 StructField('ldistance', IntegerType(), True)])
    mymovie_df = sqlContext.createDataFrame(parse_mymovies, schema=schema_mymovie).cache()

    """sbaronia - combining user input movies with the movies data, then
    finding the Levenshtein distance to every movie, and taking the one
    with the minimum Levenshtein distance as our best match"""
    movie_plus_ip = movie_df.join(mymovie_df, None, 'inner').cache()
    movie_plus_ip_distance = movie_plus_ip.withColumn(
        'ldistance', levenshtein('movie_name', 'ip_mname'))

    mymovie_distance = movie_plus_ip_distance \
        .groupBy('ip_uid', 'ip_mname') \
        .min('ldistance') \
        .withColumnRenamed('min(ldistance)', 'ldistance') \
        .cache()

    """sbaronia - join the tables to keep only the movies with minimum
    Levenshtein distance, then select the necessary columns from that
    table. Then create test data covering all movies for the new user 0"""
    refined_movies = movie_plus_ip_distance.join(
        mymovie_distance, ['ip_uid', 'ip_mname', 'ldistance'], 'inner').cache()
    input_rating = refined_movies.select('ip_uid', 'movie_id', 'ip_rating').cache()
    input_rating_rdd = input_rating.rdd.map(
        lambda row1: (row1.ip_uid, row1.movie_id, float(row1.ip_rating))).cache()

    input_with_train = sc.union([input_rating_rdd, parse_ratings]).cache()
    test_newuser = parse_movies.map(lambda line: (0, line[0])).cache()

    """sbaronia - train on all the data including the new ratings, then
    predict ratings of all movies for the new user and sort them in
    descending order of rating"""
    model = ALS.train(input_with_train, 10, 10, 0.1)

    predictions = model.predictAll(test_newuser) \
        .map(lambda row1: (row1.rating, row1.product)) \
        .sortByKey(ascending=False) \
        .map(lambda row: (row[1], row[0])) \
        .cache()

    final_rating = sqlContext.createDataFrame(predictions, ['movie_id', 'movie_rating']).cache()
    final_movie_rating = movie_df.join(final_rating, ['movie_id'], 'inner') \
        .sort("movie_rating", ascending=False).cache()
    final_movie_rating_rdd = final_movie_rating.rdd.map(
        lambda row: str(row.movie_id) + ' :: ' + str(row.movie_name) + ' :: ' + str(row.movie_rating)) \
        .coalesce(1).cache()

    final_movie_rating_rdd.saveAsTextFile(output)
# joining steps:
#  1 join both lv and yelp df
#  2 find leven. distance on name of restaurant/business
#  3 find min leven distance group by lv id
#  4 join step 3 with step 2 based on leven distance and lv_id
#  5 repeat steps 2-4, now with leven on address
#  6 remove lv_ids where count is more than 1, since there is a tie in step 5
#  7 save result as parquet with lv_id and yelp_id

# step 1
combined_data = lv_df.join(yelp_df)

# step 2, leven on names
combined_data = combined_data.withColumn(
    "leven_name", levenshtein(col("lv_name"), col("yelp_name")))

# step 3
min_leven = combined_data.groupby("lv_id").min("leven_name") \
    .select(col("lv_id").alias("lv_id2"),
            col("min(leven_name)").alias("m_leven_name"))

combined_data = combined_data.select(combined_data.lv_id,
                                     combined_data.yelp_id,
                                     combined_data.lv_addr,
                                     combined_data.yelp_addr,
                                     combined_data.leven_name.alias("lev_name"))

# step 4
combined2 = min_leven.join(combined_data,
                           [min_leven.lv_id2 == combined_data.lv_id,
                            min_leven.m_leven_name == combined_data.lev_name])
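# A minimal sketch of steps 5-7, which the snippet above stops short of.
# It reuses the column names selected above; the tie handling and the
# output path are assumptions made for illustration.
# step 5: repeat steps 2-4 with Levenshtein on the addresses of the best name matches
addr_data = combined2.withColumn(
    "leven_addr", levenshtein(col("lv_addr"), col("yelp_addr")))
min_addr = addr_data.groupby("lv_id").min("leven_addr") \
    .select(col("lv_id").alias("lv_id3"),
            col("min(leven_addr)").alias("m_leven_addr"))
best = min_addr.join(addr_data,
                     [min_addr.lv_id3 == addr_data.lv_id,
                      min_addr.m_leven_addr == addr_data.leven_addr])
# step 6: remove lv_ids that still have more than one candidate (ties)
unique_ids = best.groupby("lv_id").count() \
    .filter(col("count") == 1).select("lv_id")
# step 7: save result as parquet with lv_id and yelp_id
best.join(unique_ids, "lv_id").select("lv_id", "yelp_id") \
    .write.parquet("lv_yelp_matches.parquet")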
dist_udf = udf(tfidfDist, DoubleType())
res = res.withColumn('dist', dist_udf(res['idf1'], res['idf2']))

# Drop unnecessary columns from `data` and join in a new feature "tfidfDistance".
data = data.drop('words1', 'words2')
data = data.join(res.selectExpr('id', 'dist as tfidfDistance'), on='id', how='inner')
print("created feature TF-IDF")

# Add Levenshtein distance as the last feature, for both lemmas and questions
from pyspark.sql.functions import levenshtein
data = data.withColumn('lemma_leven', levenshtein('lemma1', 'lemma2'))
data = data.withColumn('question_leven', levenshtein('question1', 'question2'))

print('All Features Created in %d Minutes' % (float(format(time.time() - start_time)) / 60))

# Output features
outData = data.select(['id'] + featureNames + ['is_duplicate'])
outData = outData.cache()
print("Cached outData in %d Minutes" % (float(format(time.time() - start_time)) / 60))

outTrainFileName = "./AML_Project2_Data/train_features.csv"
outTestFileName = "./AML_Project2_Data/test_features.csv"
# (5) Create a new column containing the full name for each driver.

from pyspark.sql.functions import concat_ws

drivers \
    .withColumn("full_name", concat_ws(" ", "first_name", "last_name")) \
    .select("first_name", "last_name", "full_name") \
    .show(5)

# (6) Create a new column containing the average star rating for each driver.

drivers \
    .withColumn("star_rating", round(col("stars") / col("rides"), 2)) \
    .select("rides", "stars", "star_rating") \
    .show(5)

# (7) Find the rider names that are most similar to `Brian`. **Hint:** Use the
# Levenshtein function.

from pyspark.sql.functions import lit, levenshtein

riders \
    .select("first_name") \
    .distinct() \
    .withColumn("distance", levenshtein(col("first_name"), lit("Brian"))) \
    .sort("distance") \
    .show()

# ## Cleanup

# Stop the SparkSession:
spark.stop()
def calculate_simillarity(new_df):
    new_df = new_df.withColumn(
        "matching_levenshtein_dist",
        levenshtein(col("description_x"), col("description_y")))
    print(new_df.collect())
cur_dataset = cur_dataset.withColumn('sem_type', get_semantic_type(f.col('_c0')))

# Rule-based mechanism
cur_dataset = cur_dataset.withColumn(
    'sem_type',
    f.when(f.col('sem_type') == 'null',
           get_rule_based(f.col('_c0'))).otherwise(f.col('sem_type')))

# Based on soundex and edit distance
cur_dataset = cur_dataset.withColumn(
    'soundex_phon',
    f.when(f.col('sem_type') == 'null', f.soundex(f.col('_c0'))).otherwise(
        f.lit('null').cast(StringType())))
cur_dataset = cur_dataset.join(
    merged_df, [f.col('soundex_phon_cur') == f.col('soundex_phon')],
    'left_outer').withColumn(
        'edit_dist', f.levenshtein(f.col('column_value'), f.col('_c0')))

min_dataset = cur_dataset.groupBy('_c0').agg(
    f.min(f.col('edit_dist')).alias('min_edit_dist')).filter(
        f.col('min_edit_dist') <= 3).withColumnRenamed('_c0', 'c_value')

temp_dataset = cur_dataset.join(
    min_dataset, [cur_dataset._c0 == min_dataset.c_value],
    'left_outer').filter(
        cur_dataset.edit_dist == min_dataset.min_edit_dist)
temp_dataset = temp_dataset.select(
    f.col('_c0').alias('c_value'),
    f.col('column_value').alias('col_value'),
    f.col('column_name').alias('col_name'), 'min_edit_dist')
temp_dataset = temp_dataset.groupBy(f.col('c_value')).agg(
    f.first(f.col('col_name')).alias('col_name'))

cur_dataset = cur_dataset.join(
    temp_dataset, [cur_dataset._c0 == temp_dataset.c_value],
    'left_outer')
# sort the rest alphabetically
sortYT = sortYT.orderBy('title', ascending=True)

# rename columns
sortYT = sortYT.withColumnRenamed('asset_id', 'YT_ID')
sortYT = sortYT.withColumnRenamed('title', 'YT_Title')
sortYT = sortYT.withColumnRenamed('writers', 'YT_Writers')

# ----------------------------------------------
# Merging by title
# ----------------------------------------------

# Join DFs on titles with Levenshtein distance
from pyspark.sql.functions import levenshtein

joinedDF = sortdt.join(sortYT, levenshtein(sortdt["Title"], sortYT["YT_Title"]) < 3)
YTDT = joinedDF[[
    'Downtown_ID', 'YT_ID', 'Title', 'Downtown_Composer', 'YT_Writers', 'ratio'
]]

# Compute the Levenshtein distance between composer and writer names
ratioYTDT = YTDT.withColumn(
    'ratio', levenshtein(col('Downtown_Composer'), col('YT_Writers')))

# keep all rows with ratio >= 85
# YTDT = YTDT.filter(YTDT['ratio'] <= 15)  # what's a good ld to stop at?

# drop ratio column
# YTDT = YTDT.drop('ratio')

# save to output file
YTDT.write.csv("matches.csv")
    StructField('userid', IntegerType(), False),
    StructField('movieid', IntegerType(), False),
    StructField('rating', StringType(), False)
])

userrating_sql = sqlContext.createDataFrame(userrating_split, userschema)
movies_sql = sqlContext.createDataFrame(movies_split, movieschema).cache()
rating_sql = sqlContext.createDataFrame(rating_split, ratingschema)

movie_prep = movies_sql.select('movieid')
movies_join_usersrating = userrating_sql.join(movies_sql)
rating_join_movies = rating_sql.join(movie_prep, ['movieid'])

# Compute the Levenshtein distance between each user-supplied title and
# every tweet movie title, then keep the closest match per user title
distmovies = movies_join_usersrating.select(
    'movieid', 'tweetmovietitle', 'userrating', 'usermovietitle', 'userid',
    levenshtein('usermovietitle', 'tweetmovietitle').alias('min-dist')).cache()
mindistmovies = distmovies.groupBy('usermovietitle').min('min-dist') \
    .withColumnRenamed('min(min-dist)', 'min-dist')
user_joined = mindistmovies.join(distmovies, ['usermovietitle', 'min-dist']) \
    .select('userid', 'movieid', 'userrating')

train_data = user_joined.unionAll(rating_join_movies).cache()

rank = 10
numIterations = 10
model = ALS.train(train_data, rank, numIterations)
movies = model.recommendProducts(0, 10)
movies_rdd = sc.parallelize(movies, 1)
moviespredict = sqlContext.createDataFrame(movies_rdd, ratingschema)
def main(argv=None):
    if argv is None:
        inputs = sys.argv[1]
        user = sys.argv[2]
        output = sys.argv[3]

    # Initialize Spark
    os.environ['PYSPARK_PYTHON'] = "python2"
    os.environ['PYTHONPATH'] = ':'.join(sys.path)
    conf = SparkConf().setAppName('movie_recommendation')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    # Load ratings data
    ratings_data = sc.textFile(inputs + "/ratings.dat")
    ratings = ratings_data.map(lambda l: l.split('::')) \
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))

    # Load movies data
    movies_data = sc.textFile(inputs + "/movies.dat")
    movies = movies_data.map(lambda l: l.split('::')) \
        .map(lambda l: (int(l[0]), l[1].encode('utf-8').strip()))
    movies.cache()

    # Load user ratings
    user_rate_data = sc.textFile(user)
    user_rate = user_rate_data.map(lambda l: l.split(' ')) \
        .map(lambda l: (int(l[0]), list_to_string(l[1:]).encode('utf-8').strip()))
    user_rate_list = user_rate.collect()

    # Match the movie names input by the user to movie ids in the movies data
    user_rate_list_2 = []
    for item in user_rate_list:
        user_title = sc.broadcast(item[1])
        df_movie = sqlContext.createDataFrame(movies, ['movieId', 'title'])
        df_movie.registerTempTable('movies')
        df_movie2 = sqlContext.sql("SELECT *, \"" + user_title.value + "\" as user_title FROM movies") \
            .select('movieId', 'title', levenshtein('title', 'user_title').alias('distance'))
        movie_id = df_movie2.rdd.map(lambda x: (x['movieId'], x['title'], x['distance'])) \
            .reduce(find_min)
        user_rate_list_2.append([0, movie_id[0], item[0]])

    user_rate_rdd = sc.parallelize(user_rate_list_2) \
        .map(lambda l: Rating(int(l[0]), int(l[1]), float(l[2])))
    ratings_all = user_rate_rdd.union(ratings)
    ratings_all.cache()

    # Build the recommendation model using Alternating Least Squares
    ranks = [8, 20]
    numIters = [10, 20]
    bestModel = None
    bestMSE = float("inf")
    bestRank = 0
    bestNumIter = -1
    for rank, numIter in itertools.product(ranks, numIters):
        # Train model
        model = ALS.train(ratings_all, rank, numIter)
        # Evaluate MSE
        testdata = ratings.map(lambda x: (x[0], x[1]))
        predictions = model.predictAll(testdata).map(lambda x: ((x[0], x[1]), x[2]))
        ratesAndPreds = ratings.map(lambda x: ((x[0], x[1]), x[2])).join(predictions)
        MSE = ratesAndPreds.map(lambda x: (x[1][0] - x[1][1]) ** 2).mean()
        if MSE < bestMSE:
            bestModel = model
            bestMSE = MSE
            bestRank = rank
            bestNumIter = numIter

    # Generate the list of recommended movies in descending order
    rated_movies = set([x[1] for x in user_rate_list_2])
    # Only recommend movies not rated yet
    candidates = movies.filter(lambda x: x[0] not in rated_movies)
    predictions = bestModel.predictAll(candidates.map(lambda x: (0, x[0]))) \
        .map(lambda x: (x[1], x[2])).join(movies) \
        .sortBy(lambda (movieid, (score, title)): score, ascending=False)

    # Write recommendations
    outdata = predictions.map(lambda (movieId, (score, title)):
                              "Movies: %s - Score:%0.2f" % (title, score))
    outdata.saveAsTextFile(output)
gdp = spark.read.csv("final.csv", header=True, mode="DROPMALFORMED", schema=schema) gdpCountry = gdp.select("country").distinct() for country in pycountry.countries: countries.append([unicodedata.normalize('NFKD', country.name).encode('ascii','ignore'),\ unicodedata.normalize('NFKD', country.alpha_3).encode('ascii','ignore')]) countries = sc.parallelize(countries).toDF(['name', 'code']) joinedDF = gdpCountry.join(countries) joinedDF = joinedDF.select("country", "name", "code", levenshtein("country", "name").alias("lev")) cleanedData = joinedDF.filter(joinedDF.lev == 0) arr = cleanedData.select("name").rdd.map(lambda data: data["name"]).collect() con = cleanedData.select("country").rdd.map( lambda data: data["country"]).collect() countries = countries.filter(countries.name.isin(*arr) == False) gdpCountry = gdpCountry.filter(gdpCountry.country.isin(*con) == False) cleanedData.coalesce(1).write.csv("countryClean.csv", header=True) countries.coalesce(1).write.csv("ccode.csv", header=True) gdpCountry.coalesce(1).write.csv("leftover.csv", header=True)
def colsNameSimilarity(self, df, category=None, df2=None):
    """
    :param df: A Spark Dataframe
    :param category: A string keyword to match
    :param df2: A second dataframe to match column names against
    :return result_df: A dataframe having column_1, column_2, path similarity,
                       levenshtein distance and soundex_equality
    """
    # Clean up column names so that we can prevent future errors
    for colName, dtype in df.dtypes:
        if '.' in colName or '`' in colName or colName.strip() != colName:
            df = df.withColumnRenamed(
                colName, colName.strip().replace(".", "_").replace("`", ""))

    if df2 is None:
        result_df = pd.DataFrame(columns=['Column_1', 'Path Similarity'])
        category_sys = wn.synsets(category)
        if category_sys != []:
            cnt = 0
            # Score each column name against the category keyword
            for colName, dtype in df.dtypes:
                colName_ = colName.split("_")
                score = []
                for i in range(len(colName_)):
                    colName_sys = wn.synsets(colName_[i])
                    if colName_sys != []:
                        score.append(colName_sys[0].path_similarity(category_sys[0]))
                if score != []:
                    score = max(score)
                else:
                    score = 0
                result_df.loc[cnt] = [colName, score]
                cnt += 1
        else:
            print("Similarity cannot be calculated")
    else:
        for colName, dtype in df2.dtypes:
            if '.' in colName or '`' in colName or colName.strip() != colName:
                df2 = df2.withColumnRenamed(
                    colName, colName.strip().replace(".", "_").replace("`", ""))
        result_df = pd.DataFrame(columns=['Column_1', 'Column_2', 'Path Similarity'])
        cnt = 0
        # Score every pair of column names across the two dataframes
        for colName1, dtype in df.dtypes:
            colName_1 = colName1.split("_")
            for colName2, dtype2 in df2.dtypes:
                colName_2 = colName2.split("_")
                score = []
                for i in range(len(colName_1)):
                    colName_sys_1 = wn.synsets(colName_1[i])
                    for j in range(len(colName_2)):
                        colName_sys_2 = wn.synsets(colName_2[j])
                        if colName_sys_1 != [] and colName_sys_2 != []:
                            score.append(colName_sys_1[0].path_similarity(colName_sys_2[0]))
                score = [i for i in score if i != None]
                if score != []:
                    score = max(score)
                else:
                    score = 0
                result_df.loc[cnt] = [colName1, colName2, score]
                cnt += 1

    result_df = result_df[result_df['Path Similarity'] > 0.5]
    if result_df.empty is not True:
        result_df = self.spark.createDataFrame(result_df)
        if category is None:
            result_df = result_df.withColumn(
                "levenshtein distance",
                f.levenshtein(result_df["Column_1"], result_df["Column_2"]))
            result_df = result_df.withColumn(
                "soundex_equality",
                f.soundex(result_df["Column_1"]) == f.soundex(result_df["Column_2"]))
        else:
            result_df = result_df.withColumn(
                "levenshtein distance",
                f.levenshtein(result_df["Column_1"], f.lit(category)))
            result_df = result_df.withColumn(
                "soundex_equality",
                f.soundex(result_df["Column_1"]) == f.soundex(f.lit(category)))
    else:
        schema = StructType([
            StructField("Column_1", StringType(), True),
            StructField("Path Similarity", DoubleType(), True),
            StructField("levenshtein distance", DoubleType(), True),
            StructField("soundex_equality", DoubleType(), True),
        ])
        result_df = self.spark.createDataFrame(self.sc.emptyRDD(), schema=schema)

    return result_df
# Generate Dataframe 2 for testing
df2 = spark.createDataFrame(
    [
        (['dan', 'ocean', '05/25/1983', 'medical code AAA']),
        (['danny', 'oceans11', '04/26/1982', 'medical code BBB']),
        (['tess', 'ocean', '02/10/1988', 'medical code CCC']),
        (['john', 'smith', '01/30/1980', 'medical code DDD']),
        (['john', 'smith', '09/30/1981', 'medical code EEE'])
    ],
    ['firstname', 'lastname', 'dob', 'medical_code']
)
df2.show(10, False)

# 1) Concat the relevant fields used for fuzzy matching
# 2) Apply levenshtein distance (which generates a score)
# 3) Use this score as the join criterion
# 4) Left-outer join on it
joinedDF = df.join(df2,
                   levenshtein(
                       concat(df.dob, df.firstname, df.lastname),
                       concat(df2.dob, df2.firstname, df2.lastname)
                   ) < 5,
                   how='left_outer')
joinedDF.show(10, False)