def cleanUp(spark = None):
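	# Initialize logging, Hive handles and the grouped file dictionary; this is a
	# lightweight setup/cleanup helper and writes no data itself.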
	quiet_logs(sc_ = spark)
	#print("Start reading text data")
	hc, sqlc = hiveInit(sc_ = spark)
	groupedDict = globalMain(sc_= spark)
	print("start reading dictionary base")
	return
def mainOps(spark = None,baseSource = 'Not_ODS',snapshotsNeedread = False,todaysDate = 0):
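	# Build (or reload) the dictionary base and optionally the snapshot files, then
	# merge de-duplicated snapshots into the base and derive the UPC/SKU/MODEL maps.
	# baseSource: 'ODS' rebuilds the base from the ODS day groups, 'ODS_BASE' reloads
	# the previously written base from HDFS; snapshotsNeedread controls whether the
	# snapshot files are re-read and staged under /npd/s_test2/snapshotFilestemp/.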
	
	quiet_logs(sc_ = spark)
	#print("Start reading text data")
	hc, sqlc = hiveInit(sc_ = spark)
	groupedDict = globalMain(sc_= spark)
	print("start reading dictionary base")
	dictionaryBase = None
	if baseSource == 'ODS':
		final_list = groupedDict['20184521'] + groupedDict['20184522'] + groupedDict['20184523']
		print "number of base files: " + str(len(final_list))
		dictionaryBase = transferSnapshots(sqlc,final_list,spark )#'/npd/s_test2/dictionaryBase/')
		print "size of base: " + str(dictionaryBase.count())
		print("start writing down dictionary base")
		deletePath('/npd/s_test2/dictionaryBase/',sc_ = spark)
		writeDown(dictionaryBase,'/npd/s_test2/dictionaryBase/')
		print ("end of writing down")
	if baseSource == 'ODS_BASE':
		dictionaryBase = startReadingfromhdfs(sqlc = sqlc,listOffiles = '/npd/s_test2/dictionaryBase/',multi = 1,spark=spark)
		deletePath('/npd/s_test2/dictionaryBase1/',sc_= spark)
		writeDown(dictionaryBase,'/npd/s_test2/dictionaryBase1/')

	if snapshotsNeedread:
		deletePath('/npd/s_test2/snapshotFilestemp/',sc_= spark)
		snapshotIndex = 1
		totalsnapShots = sqlc.createDataFrame(spark.emptyRDD(),StructType([]))
		fileList = []
		for each in groupedDict.iteritems():
			if str(each[0]).find('201845') != -1:
				print "Base files written on that day"
				continue
			print "snapshot id: " + str(each[0])
			#print "start reading " + str(snapshotIndex) + " snapshot"
			fileList = fileList + each[1]
		print fileList
		snapshotRdd = startReadingfromhdfs(sqlc = sqlc,listOffiles = fileList,spark = spark)
		print "start writing snapshot files"
		writeDown(snapshotRdd,'/npd/s_test2/snapshotFilestemp/')
	nondupSnapshotrdd = startOverlapdetector(snapshotRdd,['./createExternalTable.sql'],sqlc,hc,spark)
	# assumed fix: overlap detection on the base needs the base dataframe as its first argument
	nondupBaserdd = startOverlapdetector(dictionaryBase,['./createExttabledictbase.sql'],sqlc,hc,spark)
	#colNames = columnRenaming(listNames)
	#snapshotBase = snapshotBase.selectExpr(colNames)
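	# Collect the snapshot poi_ids to the driver and drop matching rows from the base,
	# so the refreshed snapshot rows win; assumes the id set fits in driver memory.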
	array = nondupSnapshotrdd.select(['poi_id'])
	array = array.rdd.map(lambda x : x.poi_id).collect()
	#array = [lit(poi_id).alias("poi_id").cast("long") for poi_id  in array] 
	base = nondupBaserdd.where(~col("poi_id").isin(array)) 
	#base = startFilteringfrombase(nondupBaserdd,nondupSnapshotrdd)
	base = base.unionAll(nondupSnapshotrdd)
	base = base.withColumn("updated",base["updated"].cast("string"))
	base = base.withColumn("added",base["added"].cast("string"))
	print "dictionary base size: " + str(base.count())
	deletePath('/npd/s_test2/uniqueBasedictionary',sc_=spark)
	writeDown(base,'/npd/s_test2/uniqueBasedictionary')
	upc_map = generateUPCmap(rdddata=base)
	sku_map = generateSKUmap(rdddata=base)
	model_map = generateMODELmap(rdddata=base)
	return 
def updatePosoutletwithsnapshots(spark = None, ranges = 0, repartBase = 1, lastFilenumber = None, configOb = None, filtering = 1, table_index = 2 ):
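	# Merge the latest posoutlet snapshot files into the posoutlet base table:
	# read snapshots newer than the last processed file number, read the base from
	# HDFS or Hive depending on configOb.read_hive_odspos, repartition both sides
	# by data size, then hand everything to updateOdsposoutlet.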
	#if len(sys.argv) == 2:
	#	print("Usage: snapshotdetector number of days" + str(sys.argv[1]))
	#	ranges = int(sys.argv[1])
	print "Start working on updating the posoutlet table"
	quiet_logs(sc_ = spark)
	hc, sqlc = hiveInit(sc_ = spark)
	groupedDict = globalMain(pathName = configOb.input_path[2], sc_= spark)
	bis_list = configOb.businessSets
	dp = dynamicRepartition.factoryFunc(file_name = 'src/main/python/dictionary/configuration/resource_config')
	print("start reading snapshots for posoutlet")
	print "retrieve the table name"
	table_name = configOb.hivedbOb.get_dbName(index = 0) + "." + configOb.hivedbOb.get_tabNames(dbName_ = configOb.hivedbOb.get_dbName(index = 0), index = table_index)
	date_list, hr_ = rangeOfdate(ranges) if ranges != 0 else todaysKey(delta = configOb.delta)
	totalsnapShots = sqlc.createDataFrame(spark.emptyRDD(),StructType([]))
	fileList = []
	fileList_for_base = []
	print "dateList is :" + str(date_list)
	for each in groupedDict.iteritems():
		if not isSnapshotfiles(each[0].split()[0], date_list):
			fileList_for_base = fileList_for_base + each[1]
			continue
		fileList = fileList + each[1]
	printwithstats(fileList)	
	if len(fileList) == 0:
		print "snapshot file list is empty for the odsposoutlet table"
		print "nothing to process, exiting"
		sys.exit(0)
	for each in fileList:
		print "Snapshot files: " + str(each)
	print ("Get the filtered files to read")
	needToread,last_file_num, fn = getLastfilenumber(fileList)
	print "Last file number for snapshot: " + str(last_file_num)
	if len(needToread) == 0:
		print "snapshot files are all up to date, nothing to work on"
		print "application is exiting"
		sys.exit(0)
	fileList_for_base = filterBasefiles(fileList_for_base,lastfileNumber = fn)
	_, last_file_num_base, _ = getLastfilenumber(fileList_for_base, rev = True)
	print "Last file number for base: " + str(last_file_num_base)
	print ("Start reading snapshot files")
	stringedList = ",".join(needToread)
	snapshotRdd = startReadingfromhdfs(sqlc = sqlc, listOffiles = needToread, spark = spark, multi = 2)
	print "End of reading snapshot files"
	#print "start repartitioning the snapshsot file"
	#snapshotRdd = snapshotRdd.repartition(400)
	#print "End of repartition of snapshot dataframe"
	bis_list = [str(each) for each in bis_list] if bis_list is not None else None
	query = "select " +",".join(getColumnsname()) + ", partitioner  from dqdictionaryhivedb.uniqueodsposoutlet2_int" 
	query = query + "where business_id in (" +','.join(bis_list) + ")" if bis_list is not None else query
	baseDict = startReadingfromhdfs(sqlc = sqlc,listOffiles = fileList_for_base , multi = 2, spark = spark) if configOb.read_hive_odspos == 0 else startReadingfromhive(query = query, hc = hc, sqlc=sqlc,spark = spark)
	#baseDict = baseDict.coalesce(10000) if baseDict.rdd.getNumPartitions() > 10000 else baseDict
	#baseDict = baseDict.repartition(2500) if repartBase is 1 else baseDict
	#snapshotRdd = snapshotRdd.repartition(baseDict.rdd.getNumPartitions())
	listofC = getColumnsname()
	cols = columnRenaming(listofC)
	snapshotRdd = snapshotRdd.selectExpr(cols)
	bis_list = [ int(each) for each in bis_list] if bis_list is not None else None
	snapshotRdd = snapshotRdd.where(snapshotRdd.business_id.isin(bis_list)) if filtering == 1 else snapshotRdd
	snapshotWithpartition, itemidRdd = addingPartitionersnapshot(snapshotRdd , spark, sqlc)
	baseDict = baseDict.selectExpr(cols) if configOb.read_hive_odspos == 0 else baseDict
	baseDict = baseDict.where(baseDict.business_id.isin(set(bis_list))) if filtering == 1 else baseDict
	print "Start repartitioning the base files"
	dictCount = baseDict.count()
	dp.set_data_size(dictCount)
	resizer = dp.dynamicPartitionresizer()
	actualSize = get_partition_number(baseDict,resizer,configOb.partition_size_change)
	baseDict = baseDict.repartition(actualSize) if actualSize != 0 else baseDict
	print "End of repartitioning the base"
	print "Start of repartitioning the snapshots"
	snapCount = snapshotRdd.count()
	dp.set_data_size(snapCount)
	resizer = dp.dynamicPartitionresizer()
	actualSize = get_partition_number(snapshotRdd, resizer,configOb.partition_size_change )
	snapshotRdd = snapshotRdd.repartition(actualSize) if actualSize != 0 else snapshotRdd
	print "End of repartitioning the snapshot files"
	#baseDict = baseDict.repartition(actualSize) if actualSize is not 0 else baseDict
	print "Start updating the posoutlet table"
	updateOdsposoutlet(snapshotRdd, baseDict = baseDict, itemidRdd = itemidRdd, process_dict = 1, spark = spark , ranges = ranges, readHdfs = configOb.read_hive_odspos, repartBase = repartBase, appendMode = 0, addpartitionCol = 1, process_zero = 0, fileList = fileList, rddwithPartition = snapshotWithpartition, lastFilenumber = last_file_num, table_name = table_name, hdfs_output = configOb.input_path[0],configOb = configOb )
	return
def readsnapshotBasetoupdate(spark=None,
                             ranges=2,
                             lastFilenumber=None,
                             configOb=None,
                             table_index=3,
                             filtering=0):
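    # Merge the latest odsitems snapshot files into the items base table: pick the
    # snapshot files for the requested date range, read the base from HDFS or Hive
    # depending on configOb.read_hive_odsitem, then delegate the merge/update to
    # startWorkingsnapshots.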
    print "initiate programe"
    quiet_logs(sc_=spark)
    print("Start reading text data")
    hc, sqlc = hiveInit(sc_=spark)
    bis_list = list(configOb.businessSets) if configOb is not None else None
    #print configOb.hivedbOb.get_tabNames(dbName_ = configOb.hivedbOb.get_dbNames(index = 0), index = 2)
    table_name = configOb.hivedbOb.get_dbName(
        index=0) + "." + configOb.hivedbOb.get_tabNames(
            dbName_=configOb.hivedbOb.get_dbName(index=0), index=table_index)
    print table_name
    groupedDict = globalMain(pathName=configOb.input_path[3], sc_=spark)
    snapshotIndex = 1
    date_list = []
    date_list, hr_ = rangeOfdate(ranges) if ranges != 0 else todaysKey(
        delta=configOb.delta)
    print "dates are: " + str(date_list)
    totalsnapShots = sqlc.createDataFrame(spark.emptyRDD(), StructType([]))
    fileList = []
    fileList_for_base = []
    final_rdd = None
    for each in groupedDict.iteritems():
        if not isSnapshotfiles(each[0].split()[0], date_list):
            fileList_for_base = fileList_for_base + each[1]
            continue
        fileList = fileList + each[1]

    printwithstats(fileList)
    if len(fileList) == 0:
        print "snapshot files for odsitems are empty"
        print "nothing to process, exiting"
        sys.exit(0)

    print("Get the filtered files to read")
    needToread, last_file_num, fn = getLastfilenumber(fileList)
    print "last snapshot file number: " + str(fn)
    fileList_for_base = filterBasefiles(fileList_for_base, lastfileNumber=fn)
    print "last file number for snapshot: " + str(last_file_num)
    _, last_file_num_for_base, _ = getLastfilenumber(fileList_for_base,
                                                     rev=True)
    print "last base file number : " + str(last_file_num_for_base)
    print("start reading snapshot files")
    snapshotRdd = startReadingfromhdfs(sqlc=sqlc,
                                       listOffiles=needToread,
                                       multi=2,
                                       spark=spark)
    query = "select distinct ITEMID, BUSINESSID, SUBCATEGORYN, ITEMNUMBER, UNITSPACKAGE, FLD01, FLD02, FLD03, FLD04, FLD05, FLD06, FLD07, FLD08, FLD09, FLD10, FLD11, FLD12, FLD13, FLD14, FLD15, FLD16, FLD17, FLD18, FLD19, FLD20, FLD21, FLD22, FLD23, FLD24,FLD25, FLD26, FLD27, FLD28, FLD29, FLD30, FLD31, FLD32, FLD33, FLD34, FLD35, FLD36, FLD37, FLD38, FLD39, FLD40, FLD41, FLD42, FLD43, FLD44, FLD45, FLD46, FLD47, FLD48, FLD49, FLD50, FLD51, FLD52, FLD53, FLD54, FLD55, FLD56, FLD57, FLD58, FLD59, FLD60, FLD61, FLD62, FLD63, FLD64, FLD65, FLD66, FLD67, FLD68, FLD69, FLD70, FLD71, FLD72, FLD73, FLD74, FLD75, FLD76, FLD77, FLD78, FLD79, FLD80, FLD81, FLD82, FLD83, FLD84, FLD85, FLD86, FLD87, FLD88, FLD89, FLD90, FLD91, FLD92, FLD93, FLD94, FLD95, FLD96, FLD97, FLD98, FLD99, STATUS, ADDED, UPDATED, VFLD01, VFLD02, VFLD03, VFLD04, VFLD05, COUNTRY_CODE, GROUPITEMID, PARENTITEMID, PARENTITEMID_STATUS, OUTLETITEM_MAP_CHANGE_DATE, LOCKDOWN_STATUS from dqdictionaryhivedb.uniqueodspositems2_int"
    bis_list = [str(each)
                for each in bis_list] if bis_list is not None else None
    query = query + " where BUSINESSID in (" + ','.join(
        bis_list) + ")" if bis_list is not None else query
    baseDict = startReadingfromhdfs(
        sqlc=sqlc, listOffiles=fileList_for_base, multi=2, spark=spark
    ) if configOb.read_hive_odsitem == 0 else startReadingfromhive(
        query=query, hc=hc, sqlc=sqlc, spark=spark)
    listofC = getColumnsnameods()
    cols = columnRenaming(listofC)
    baseDict = baseDict.selectExpr(
        cols) if configOb.read_hive_odsitem == 0 else baseDict
    snapshotRdd = snapshotRdd.selectExpr(cols)
    bis_list = [int(each)
                for each in bis_list] if bis_list is not None else None
    snapshotRdd = snapshotRdd.where(snapshotRdd.businessid.isin(
        set(bis_list))) if filtering == 1 else snapshotRdd
    baseDict = baseDict.where(baseDict.businessid.isin(
        set(bis_list))) if filtering == 1 else baseDict
    #baseDict = baseDict.withColumn("added",to_timestamp("added","yyyy_MM_dd hh_mm_ss"))
    #baseDict = baseDict.withColumn("updated",to_timestamp("updated","yyyy_MM_dd hh_mm_ss"))
    #baseDict = baseDict.withColumn("outletitem_map_change_date",to_timestamp("outletitem_map_change_date","yyyy_MM_dd hh_mm_ss"))
    print "snapshot data count started"
    startWorkingsnapshots(snapshotRdd=snapshotRdd,
                          baseDict=baseDict,
                          spark=spark,
                          ranges=ranges,
                          process_dict=1,
                          dict_hdfs=0,
                          dict_hive=configOb.read_hive_odsitem,
                          writebackType=0,
                          debug=0,
                          fileList=fileList,
                          lastFilenumber=lastFilenumber,
                          table_name=table_name,
                          hdfs_output=configOb.input_path[1],
                          writeTohdfs=0,
                          append_in_hive=configOb.append_in_hive,
                          updatehivetable=configOb.stage['updatehivetable'])
    return
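# Illustrative driver (a sketch, not part of the pipeline): the entry points above
# are assumed to be called with an active Spark session/context and a populated
# configuration object, e.g.
#
#   updatePosoutletwithsnapshots(spark = spark, ranges = 2, configOb = configOb)
#   readsnapshotBasetoupdate(spark = spark, ranges = 2, configOb = configOb, table_index = 3)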