def _compute_tfid(texts: RDD) -> RDD:
    """Attach a TF-IDF vector to every text object in *texts*.

    Parameters
    ----------
    texts : RDD
        RDD of objects exposing a ``.words`` token list and a
        ``set_tfidf(vector)`` mutator.

    Returns
    -------
    RDD
        The texts mapped through ``set_tfidf`` with their TF-IDF vector.
        (Bug fix: the original annotation claimed ``IDFModel``, but the
        function returns the mapped RDD, not the fitted model.)
    """
    # Hash each text's token list into a term-frequency vector.
    tf = HashingTF().transform(texts.map(lambda t: t.words))
    # IDF needs two passes over `tf` (fit, then transform), so cache it.
    tf.cache()
    idf = IDF().fit(tf)
    tfidfs = idf.transform(tf)
    # zip() requires both RDDs to have identical partitioning/ordering;
    # `tfidfs` derives from `texts` through deterministic maps, so it holds.
    text_tfs = texts.zip(tfidfs)
    # NOTE(review): assumes set_tfidf returns the mutated object — confirm.
    return text_tfs.map(lambda t: t[0].set_tfidf(t[1]))
def tfidf(self):
    """Compute TF-IDF vectors for ``self._sents`` and store the results.

    Side effects: raw term frequencies go to ``self._tf``, the fitted
    model to ``self.idf``, and a position -> TF-IDF-vector dict to
    ``self._tfidf``.  (NOTE(review): attribute naming is inconsistent —
    ``idf`` lacks the leading underscore the siblings use.)
    """
    term_freqs = HashingTF().transform(self._sents)
    self._tf = term_freqs
    # Cached because both fit() and transform() traverse this RDD.
    term_freqs.cache()
    idf_model = IDF().fit(term_freqs)
    self.idf = idf_model
    weighted = idf_model.transform(term_freqs)
    # Materialize on the driver, keyed by sentence position.
    self._tfidf = dict(enumerate(weighted.collect()))
def parseTextRDDToIndex(self, data, label=True):
    """Convert an RDD of text lines into TF-IDF feature vectors.

    When ``label`` is True, each line is "<label> <word> <word> ..." and
    an RDD of ``LabeledPoint`` is returned; otherwise each line is just
    whitespace-separated words and the raw TF-IDF RDD is returned.
    """
    if label:
        # First token is the numeric label; the rest are the document.
        labels = data.map(lambda line: float(line.split(" ", 1)[0]))
        documents = data.map(lambda line: line.split(" ", 1)[1].split(" "))
    else:
        documents = data.map(lambda line: line.split(" "))

    tf = HashingTF().transform(documents)
    tf.cache()  # reused by both fit() and transform()
    # Drop terms appearing in fewer than 2 documents.
    idfIgnore = IDF(minDocFreq=2).fit(tf)
    index = idfIgnore.transform(tf)

    if not label:
        return index
    # Pair labels with vectors; zip is safe since both share lineage.
    return labels.zip(index).map(
        lambda pair: LabeledPoint(pair[0], pair[1]))
def _compute_idf(texts: RDD) -> IDFModel:
    """Fit and return an IDF model over the hashed term frequencies of *texts*.

    *texts* is assumed to already be an RDD of token sequences suitable
    for ``HashingTF.transform``.
    """
    hashed = HashingTF().transform(texts)
    # fit() iterates the RDD; caching avoids recomputing the hashing pass.
    hashed.cache()
    return IDF().fit(hashed)
training_raw = sc.parallelize(traindata) labels = training_raw.map( lambda doc: doc["label"], # Standard Python dict access preservesPartitioning=True # This is obsolete. ) # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes: # First to compute the IDF vector and second to scale the term frequencies by IDF. tf = HashingTF(numFeatures=numfeatures).transform( ## Use much larger number in practice training_raw.map(lambda doc: doc["text"].split(), preservesPartitioning=True)) tf.cache() idf = IDF().fit(tf) tfidf = idf.transform(tf) # Combine using zip training = labels.zip(tf).map(lambda x: LabeledPoint(x[0], x[1])) # TEST DATA testlabel = testlabels.map(lambda line: float(line)) t = reviewdata1.collect() l = testlabel.collect() testdata = [{"text":t[i],"label":l[i]} for i in range(len(l))] test_raw = sc.parallelize(testdata) testlabels = test_raw.map(