def BuildSubstringFeature(ngrams, w2v, start, end, ctxSize, lookahead, stringCol='sentence'):
    """Assemble a single 'feature' vector column from per-position vectors.

    For every context position in ``range(start, end)`` a substring column
    named 'ngrams' is produced with ``CreateSubstring``, passed through
    ``w2v.transform``, and the resulting vector is appended to a running
    vector using a ``VectorAssembler``.

    Args:
        ngrams: DataFrame to extend; ``stringCol`` is the column handed to
            ``CreateSubstring``.
        w2v: Transformer applied at every position. Presumably it reads the
            'ngrams' column and writes a 'vector' column (the code renames
            'vector' and drops 'ngrams' after each call) -- confirm against
            the model's configuration.
        start: First context position (inclusive).
        end: Last context position (exclusive).
        ctxSize: Forwarded to ``GramSize`` together with ``lookahead``.
        lookahead: Forwarded to ``GramSize`` together with ``ctxSize``.
        stringCol: Name of the source column passed to ``CreateSubstring``.

    Returns:
        The DataFrame with the assembled vector in a 'feature' column. The
        intermediate 'ngrams', 'ctx<N>' and 'tmp' columns are dropped or
        renamed along the way. When ``range(start, end)`` is empty the loop
        body never runs and no vector is assembled.
    """
    gramSize = GramSize(ctxSize, lookahead)
    vecass = VectorAssembler(outputCol='feature')

    # Create a vector column for each context position and fold it into the
    # running vector, which lives in the 'tmp' column between iterations.
    for ctxpos in range(start, end):
        # Column that holds the vector for this context position.
        colName = 'ctx' + str(ctxpos)

        # Create the vector for this context position.
        ngrams = CreateSubstring(ngrams, stringCol, 'ngrams', gramSize, ' ', ctxpos, ctxpos + 1, True)
        ngrams = w2v.transform(ngrams).withColumnRenamed('vector', colName).drop('ngrams')

        if ctxpos == start:
            # First position: seed the running vector from this column alone.
            ngrams = vecass.setParams(inputCols=[colName]).transform(ngrams)
        else:
            # Later positions: append to the accumulated vector, then discard
            # the previous accumulator so its name can be reused.
            ngrams = vecass.setParams(inputCols=['tmp', colName]).transform(ngrams).drop('tmp')

        # The assembler always writes 'feature'; park it under 'tmp' so the
        # next iteration can write 'feature' again, and drop the per-position
        # column that has now been folded in.
        ngrams = ngrams.withColumnRenamed('feature', 'tmp')
        ngrams = ngrams.drop(colName)

    ngrams = ngrams.withColumnRenamed('tmp', 'feature')
    return ngrams
def BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize):
    """Assemble per-dimension model predictions into one 'vector' column.

    For each dimension in ``range(vecSize)`` the model ``lrmodels[dim]`` is
    applied to the DataFrame, its 'prediction' column is renamed to
    'prediction<dim>', and that column is appended to a running vector using
    a ``VectorAssembler``.

    Args:
        ngrams: DataFrame the models are applied to. Presumably it carries
            the 'feature' column the models consume (that column is dropped
            at the end) -- confirm against how the models were fitted.
        lrmodels: Indexable collection of models; entries ``0`` to
            ``vecSize - 1`` are used, and each must emit a 'prediction'
            column from ``transform``.
        ctxSize: Unused in this function; kept so existing callers keep
            working.
        vecSize: Number of dimensions (and models) to assemble.

    Returns:
        The DataFrame with the assembled predictions in a 'vector' column and
        the 'feature' column removed. The intermediate 'prediction<dim>' and
        'tmp' columns are dropped or renamed along the way. When ``vecSize``
        is 0 the loop body never runs and no vector is assembled.
    """
    vecass = VectorAssembler(outputCol='vector')

    # Get the prediction for each dimension and fold it into the running
    # vector, which lives in the 'tmp' column between iterations.
    for dim in range(vecSize):
        lrm = lrmodels[dim]
        colName = 'prediction' + str(dim)
        ngrams = lrm.transform(ngrams).withColumnRenamed('prediction', colName)

        if dim == 0:
            # First dimension: seed the running vector from this column alone.
            ngrams = vecass.setParams(inputCols=[colName]).transform(ngrams)
        else:
            # Later dimensions: append to the accumulated vector, then discard
            # the previous accumulator so its name can be reused.
            ngrams = vecass.setParams(inputCols=['tmp', colName]).transform(ngrams).drop('tmp')

        # The assembler always writes 'vector'; park it under 'tmp' so the
        # next iteration can write 'vector' again, and drop the per-dimension
        # column so the next model can emit a fresh 'prediction'.
        ngrams = ngrams.withColumnRenamed('vector', 'tmp')
        ngrams = ngrams.drop(colName)

    ngrams = ngrams.withColumnRenamed('tmp', 'vector')
    ngrams = ngrams.drop('feature')
    return ngrams