Exemplo n.º 1
0
def CreateTrainDev(in_direct, out_direct_train, out_direct_test, train_ratio, test_ratio, train_doc_file, out_train_tbl):
	"""Split a directory of files into train/test sets and build the training tables.

	Steps, in order:
	  1. Shuffle the files in ``in_direct`` and copy each one into either
	     ``out_direct_train`` or ``out_direct_test``.
	  2. Create pseudo-documents from toponyms (``PseudoDocCreator.calc``).
	  3. Create the document table (``LoadDBV1.Load``).
	  4. Calculate the local stats table (``LocalSpatialStatsV1.calc``, "gi").

	Args:
		in_direct: Directory holding the input files to split.
		out_direct_train: Directory receiving the training files (created if missing).
		out_direct_test: Directory receiving the test files (created if missing).
		train_ratio: Accepted for interface compatibility; not used by the split.
		test_ratio: Target number of test files per training file. A file goes to
			the test set while ``test_files <= train_files * test_ratio`` and at
			least one training file exists, so the first file is always training.
		train_doc_file: Path handed to the pseudo-document, load and stats steps.
		out_train_tbl: Base name of the training document table.

	Returns:
		Tuple ``(train_tbl, out_direct_test)`` where ``train_tbl`` is
		``out_train_tbl + "_kernel100k_epanech_gi"`` for the constants below.

	NOTE(review): ``conn_info`` and ``gtbl`` are neither parameters nor locals;
	they must be resolvable from the enclosing module scope -- confirm.
	"""
	cores = 2
	kerntype = "epanech"
	kern_dist = 100000
	file_type = "lgl"

	def CopyText(src_path, dst_path):
		"""Copy one file, reading and writing it as UTF-8 text."""
		# Open the source first so an unreadable input does not leave an
		# empty/truncated output behind; both handles are closed on any error.
		with io.open(src_path, 'r', encoding='utf-8') as reader:
			with io.open(dst_path, 'w', encoding='utf-8') as writer:
				writer.write(reader.read())

	def SplitFiles(in_direct, out_direct_train, out_direct_test, train_ratio, test_ratio):
		"""Randomly distribute the files of in_direct over the two output dirs."""
		train_files = 0
		test_files = 0

		for out_dir in (out_direct_train, out_direct_test):
			if not os.path.exists(out_dir):
				os.makedirs(out_dir)

		# Only regular files can be copied; a sub-directory would make io.open fail.
		files = [f for f in os.listdir(in_direct)
			if os.path.isfile(os.path.join(in_direct, f))]

		shuffle(files)

		for f in files:
			src_path = os.path.join(in_direct, f)
			# train_files >= 1 forces the very first file into the training set.
			to_test = (float(test_files) <= float(train_files) * test_ratio
				and train_files >= 1)
			if to_test:
				CopyText(src_path, os.path.join(out_direct_test, f))
				test_files += 1
			else:
				CopyText(src_path, os.path.join(out_direct_train, f))
				train_files += 1

		# Single-argument print(...) emits the same text on Python 2 and 3.
		print("Test Files : %s" % test_files)
		print("Train Files:  %s" % train_files)

	SplitFiles(in_direct, out_direct_train, out_direct_test, train_ratio, test_ratio)

	#Create Pseudo-Documents from Toponyms
	import PseudoDocCreator as PSD
	PSD.calc(out_direct_train, train_doc_file, 100, file_type)

	#Create Document Table
	import LoadDBV1 as LDB
	LDB.Load(train_doc_file, out_train_tbl, conn_info, "wiki")

	#Calc Local Stats Tables
	import LocalSpatialStatsV1 as LS
	# Floor division keeps the suffix "100k" even where "/" is true division.
	train_tbl_part1 = out_train_tbl + "_kernel" + str(kern_dist // 1000) + "k" + "_" + kerntype
	# Positional order: f, statistic, dtbl, gtbl, conn_info, outf, out_tbl, kern_dist,
	# kerntype, traintype, listuse, whitelist_file, grid_min, cores, include_zero
	LS.calc(train_doc_file, "gi", out_train_tbl, gtbl, conn_info, "DummyOutfile.txt", train_tbl_part1, kern_dist, kerntype, "wiki", False, "any", 0, cores, False)
	train_tbl = train_tbl_part1 + "_gi"
	return train_tbl, out_direct_test
Exemplo n.º 2
0
        except:
            print "Did not provide a valid kerntype option, defaulting to uniform"
            kerntype = "uniform"

        #Should probabilities of zero be written to tbl? (yes for similarity scores, no for Top Resolver)
        try:
            include_zero = args[args.index('-include_zero')+1]
            if include_zero.lower() == "false":
                include_zero = False
            else: include_zero = True
        except:
            print "Did not provide include zero argument, defaulting to True"
            include_zero = True


        LSS.calc(f, statistic, dtbl, gtbl, conn_info, outf, out_tbl, kern_dist, kerntype, traintype, listuse, whitelist_file, grid_min, cores, include_zero)
        

    ##########################Load a database with | Doc ID | Geometry | table#####################
    if mode_arg.lower() == "loaddb":
        import LoadDBV1 as loadDB
        print "Starting DB Load Process"

        if '-tf' in args:
            f = args[args.index("-tf")+1]
        elif '-df' in args:
            f = args[args.index("-df")+1]
        elif '-tstf' in args:
            f = args[args.index("-tstf")+1]
            
        try: