# TF-IDF per (page, word) pair, followed by the start of a page-similarity
# helper.  `lines`, `parseContext`, `doc_count`, `sc` and `math` are expected
# to be defined/imported earlier in the file.
#
# NOTE(review): this section had lost its line breaks; statement nesting was
# reconstructed from data flow and should be confirmed against the original.

# Each element of `words` is unpacked further down as ((page_id, title), word).
words = lines.flatMap(parseContext)

# Same records with the two halves exchanged: (word, (page_id, title)).
# Index access is used instead of `lambda (x, y): ...` because tuple
# parameters are a syntax error on Python 3 (PEP 3113); this form runs on both.
words_swap = words.map(lambda pair: (pair[1], pair[0]))

# Occurrences of every ((page_id, title), word) record.
wordcount = words.map(lambda s: (s, 1)).reduceByKey(lambda a, b: a + b)
# Occurrences of every (word, (page_id, title)) record.
wordcount_page = words_swap.map(lambda s: (s, 1)).reduceByKey(lambda a, b: a + b)
# Number of word records per page.
count_page = words.map(lambda pair: (pair[0], 1)).reduceByKey(lambda a, b: a + b)
# Number of distinct pages each word appears on.
doc_word = (
    words_swap.distinct()
    .map(lambda pair: (pair[0], 1))
    .reduceByKey(lambda a, b: a + b)
)

# Pull the two denominators to the driver once.  The previous version ran two
# full `filter(...).count()` Spark jobs for every collected row; the counts it
# obtained are exactly the ones already aggregated in `count_page` and
# `doc_word`.
words_per_page = count_page.collectAsMap()
pages_per_word = doc_word.collectAsMap()

app = []
for ((page_id, title), word), n in wordcount.collect():
    word_page = words_per_page[(page_id, title)]
    word_all_page = pages_per_word[word]
    # float() forces true division; under Python 2 the integer `/` truncated
    # both ratios, which zeroed most scores.
    tf_idf = (float(n) / word_page) * math.log(float(doc_count) / word_all_page)
    app.append([(page_id, title, word, tf_idf)])

## part 2: read the scores back as an RDD
# SparkContext exposes `parallelize`; `parallelized` does not exist.
v = sc.parallelize(app)

## append [word, tf_idf] entries as a list by page id
# Every record in `app` is a one-element list wrapping a 4-tuple, so it cannot
# be unpacked as a 2-tuple; pick the fields explicitly instead.
# NOTE(review): keying by page id alone follows the comment above — confirm
# that the title is not needed in the key.
trans = v.map(lambda rec: (rec[0][0], [rec[0][2], rec[0][3]])).groupByKey()


## key pair similarity (e-distance)
def similar(wf):
    # NOTE(review): only the beginning of this function is visible here, so
    # its logic is left untouched.  It indexes the module-level `v` (an RDD)
    # and never reads its parameter `wf` — presumably `wf` was intended;
    # confirm before relying on it.
    fun_result = []  # words seen so far, first entry's words first
    list1 = {}       # item[0] -> item[1] for the first entry
    list2 = {}       # item[0] -> item[1] for the second entry
    for item in v[0][1]:
        fun_result.append(item[0])
        list1.setdefault(item[0],item[1])
    for item in v[1][1]:
        if item[0] not in fun_result:
            fun_result.append(item[0])
        # NOTE(review): placed at loop level so every item of the second
        # entry is recorded; the original nesting could not be recovered.
        list2.setdefault(item[0],item[1])
    result1 = []
    result2 = []