예제 #1
0
def BuildSubstringFeature(ngrams, w2v, start, end, ctxSize, lookahead, stringCol='sentence'):
	"""Add a 'feature' vector column built from the vectors of several context positions.

	For every context position in range(start, end) the function:
	  1. calls CreateSubstring to write an 'ngrams' column derived from
	     stringCol (presumably the token at that position -- confirm against
	     CreateSubstring),
	  2. runs w2v.transform and keeps its 'vector' output as 'ctx<pos>',
	  3. drops the intermediate 'ngrams' column.
	All 'ctx<pos>' columns are then concatenated, in ascending position order,
	into one 'feature' column and the per-position columns are removed.

	Args:
		ngrams: input DataFrame (assumed to be a Spark DataFrame holding stringCol).
		w2v: fitted transformer whose transform() adds a column named 'vector'
			(presumably a Word2Vec model reading the 'ngrams' column -- confirm).
		start: first context position (inclusive).
		end: last context position (exclusive).
		ctxSize: forwarded to GramSize together with lookahead.
		lookahead: forwarded to GramSize together with ctxSize.
		stringCol: name of the source column handed to CreateSubstring.

	Returns:
		The DataFrame with an extra 'feature' column. When range(start, end)
		is empty the input is returned unchanged (no 'feature' column is added).
	"""
	gramSize = GramSize(ctxSize, lookahead)

	# create a vector column for each context position
	colNames = []
	for ctxpos in range(start, end):
		# name of the column holding the vector for this context position
		colName = 'ctx' + str(ctxpos)

		# create the vector for the context position; 'vector' is renamed right
		# away so the next w2v.transform call can reuse that output name
		ngrams = CreateSubstring(ngrams, stringCol, 'ngrams', gramSize, ' ', ctxpos, ctxpos + 1, True)
		ngrams = w2v.transform(ngrams).withColumnRenamed('vector', colName).drop('ngrams')
		colNames.append(colName)

	# nothing to assemble for an empty range; hand the input back untouched
	if not colNames:
		return ngrams

	# concatenate all context vectors in one pass instead of re-assembling the
	# accumulated vector through a temporary column on every iteration
	vecass = VectorAssembler(inputCols=colNames, outputCol='feature')
	ngrams = vecass.transform(ngrams)

	# the per-position columns are only intermediates
	for colName in colNames:
		ngrams = ngrams.drop(colName)

	return ngrams
예제 #2
0
def BuildPredictionVector(ngrams, lrmodels, ctxSize, vecSize):
	"""Add a 'vector' column made of one model prediction per dimension.

	For each dim in range(vecSize), lrmodels[dim].transform is applied and its
	'prediction' output is kept as 'prediction<dim>'. The per-dimension
	predictions are then assembled, in ascending dimension order, into a
	single 'vector' column; the per-dimension columns and the 'feature'
	column are dropped from the result.

	Args:
		ngrams: input DataFrame (assumed to be a Spark DataFrame carrying the
			'feature' column the models read -- confirm against the callers).
		lrmodels: indexable collection with one fitted model per dimension;
			each model's transform() must add a column named 'prediction'.
		ctxSize: unused; kept so existing callers keep working.
		vecSize: number of dimensions, i.e. how many models are applied.

	Returns:
		The DataFrame with a 'vector' column and without 'feature'. When
		vecSize is 0 no 'vector' column is added; only 'feature' is dropped.
	"""
	# get prediction for each dimension
	incols = []
	for dim in range(0, vecSize):
		lrm = lrmodels[dim]

		# rename right away so the next model can emit 'prediction' again
		colName = 'prediction' + str(dim)
		ngrams = lrm.transform(ngrams).withColumnRenamed('prediction', colName)
		incols.append(colName)
	# end getting prediction for each dimension

	if incols:
		# assemble every prediction in one pass instead of growing the vector
		# through a temporary column on each iteration
		vecass = VectorAssembler(inputCols=incols, outputCol='vector')
		ngrams = vecass.transform(ngrams)

		# the per-dimension columns are only intermediates
		for colName in incols:
			ngrams = ngrams.drop(colName)

	ngrams = ngrams.drop('feature')

	return ngrams