def test_idf_model(self):
    """Fit an IDF model over four 11-dimensional dense term-frequency
    vectors and verify the fitted idf vector has one weight per term.
    """
    term_freqs = [
        Vectors.dense([1, 2, 6, 0, 2, 3, 1, 1, 0, 0, 3]),
        Vectors.dense([1, 3, 0, 1, 3, 0, 0, 2, 0, 0, 1]),
        Vectors.dense([1, 4, 1, 0, 0, 4, 9, 0, 1, 2, 0]),
        Vectors.dense([2, 1, 0, 3, 0, 0, 5, 0, 2, 3, 9]),
    ]
    # Two partitions, to exercise the distributed fit path.
    rdd = self.sc.parallelize(term_freqs, 2)
    fitted = IDF().fit(rdd)
    weights = fitted.idf()
    self.assertEqual(len(weights), 11)
# get url list
# Each corpus line is "<id>###<url>###..."; field 1 is the URL.
# NOTE(review): despite its name, url_dict is a list of URLs in file order.
with open("../../data/corpus.txt", "r") as corpus_f:  # with-block fixes leaked file handle
    url_file = corpus_f.readlines()
url_dict = []
for val in url_file:
    d = re.split("###", val.strip())
    url = d[1]
    url_dict.append(url)
doc_num = len(url_dict)

# calculate TF-IDF feature
word_dict_size = 504927  # fixed hashing dimension for HashingTF features
hashingTF = HashingTF(word_dict_size)
tf_words = hashingTF.transform(word_file.map(processWords))
tf_raw = hashingTF.transform(txt_file.map(processCorpus))
# Fit IDF on the raw term frequencies, then scale by the max idf weight.
idf = IDF().fit(tf_raw)
max_idf = float(idf.idf().max())
tf = tf_raw.map(processTF)
tfidf = idf.transform(tf).map(processTFIDF(max_idf))
# Cache the RDDs that are reused downstream, spilling to disk if needed.
tfidf.persist(StorageLevel.MEMORY_AND_DISK)
tf.persist(StorageLevel.MEMORY_AND_DISK)
tf_raw.persist(StorageLevel.MEMORY_AND_DISK)

# get word dictionary
with open("../../data/all_words.txt", "r") as words_f:  # with-block fixes leaked file handle
    words_all = [val.strip() for val in words_f]
word_dict = {val: 1 for val in words_all}

# get inverted index: word -> documents containing it
temp1 = txt_file.map(filterWords(word_dict))
temp2 = temp1.flatMap(lambda line: line)
inverted_index = temp2.groupByKey()