예제 #1
0
def create_sample_file_library():
	"""Register the bundled sample data files as UserFile records.

	For every file name in the hard-coded sample list (looked up under
	/var/www/sample/), the file is read to count its rows and columns, its
	size on disk is taken from os.stat, and a UserFile of type "S" owned by
	the User with id=1 is saved with the file attached.

	The column count is taken from the first line only: it is split on tabs
	if that line contains a tab, otherwise on commas. An empty file is
	registered with 0 rows and 0 columns.

	Raises whatever open()/os.stat() raise for a missing file, and
	User.DoesNotExist if there is no user with id=1.
	"""
	import os
	from django.core.files import File as DjangoFile

	prefix = "/var/www/sample/"
	sample_files = ["celebrity.txt", "birdNybirdExtracted.txt", "dbgen-1k_input.txt", "restaurant.txt", "dbgen_50k_input.txt"]

	# Every sample file has the same owner, so look it up once.
	owner = User.objects.get(id=1)

	for fn in sample_files:
		path = os.path.join(prefix, fn)
		size = os.stat(path).st_size
		with open(path) as f:
			lines = f.readlines()
			rows = len(lines)
			if lines:
				header = lines[0]
				columns = len(header.split("\t" if "\t" in header else ","))
			else:
				# Empty file: there is no first line to infer a delimiter from.
				columns = 0
			# Rewind so the wrapped file is stored from the beginning, not from EOF.
			f.seek(0)

			uf = UserFile(user=owner, input_file=DjangoFile(f), size=size, rows=rows, columns=columns, type="S")
			uf.save()
예제 #2
0
def _load_edits(job):
	"""Fetch every EditedResult of job in one query, indexed by local_id (as text)."""
	edits = {}
	for edit in EditedResult.objects.filter(job=job):
		# The first record wins when a local_id is duplicated.
		edits.setdefault('%s' % edit.local_id, edit.value)
	return edits


def _row_marked_deleted(edits, row_id):
	"""Return True when the row-level edit for row_id exists and is not "false"."""
	key = '%i' % row_id
	if key not in edits:
		return False
	return edits[key].lower() != "false"


def _apply_cell_edits(cells, row_id, edits):
	"""Return cells with every edited value ("<row_id>-<column>") substituted in."""
	return [edits.get('%i-%i' % (row_id, col), cell) for col, cell in enumerate(cells)]


def save_results_mp(job, user, q):
	"""Build the final cleaned results file for job and notify the job's user.

	The job's input file is fetched from S3 and processed line by line
	(rows are numbered from 1):

	* a row whose EditedResult (local_id = row number) has a value other
	  than "false" is dropped from the output;
	* for a kept row with exactly the expected number of columns, each cell
	  is replaced by the value of the EditedResult with local_id
	  "<row>-<column>" when one exists;
	* a kept row with a different number of columns is copied unchanged;
	* empty lines (e.g. the one produced by a trailing newline) are skipped.

	Output rows are always joined with commas, whatever the input delimiter.
	The result is stored as a UserFile of type "O" linked to the job: the
	most recent existing output file of the job is overwritten, otherwise a
	new one is created. The job status is then set to "results" and an
	e-mail is sent to job.user.email.

	job  - the job being finalised
	user - owner assigned to a newly created output UserFile
	q    - queue-like object; receives progress as a fraction in [0, 1)
	       while rows are processed, then 100. once the file is saved
	"""
	connection.close() #needed bc Django + multiprocessing = messy
	results_csv = []

	original = get_string_from_s3(USER_FILE_BUCKET, job.get_input_file().name).split('\n')
	marker = '\t' if '\t' in original[0] else ','
	ncols = job.get_user_file().columns

	# One query up front instead of one query per row and per cell.
	edits = _load_edits(job)

	total = len(original)
	# Report progress roughly once per percent. The previous float-modulo test
	# parsed as ((i/len) % 10.) / len and therefore only fired for the first row.
	step = max(1, total // 100)

	#generate data file
	for i, line in enumerate(original):
		if i % step == 0:
			q.put(float(i) / total)
		# Lines come from split('\n'), so a blank line is "" (or "\r"), never "\n".
		if not line.rstrip("\r"):
			continue
		row_id = i + 1
		if _row_marked_deleted(edits, row_id):
			#if row is checked, don't include in final data file
			continue
		cells = line.split(marker)
		if len(cells) == ncols:
			cells = _apply_cell_edits(cells, row_id, edits)
		results_csv.append(','.join(cells))

	nrows = len(results_csv)
	results_csv = "\n".join(results_csv)

	latest = UserFile.objects.latest('id')
	output_file = ContentFile(results_csv)
	output_file.name = "%s.%i.%i.cleaned.csv" % (".".join(job.get_input_file().name.split(".")[:-1]), job.id, latest.id + 1)

	# Reuse the job's newest output file if there is one. Restricting the lookup
	# to type "O" keeps any other file linked to the job from being overwritten.
	previous = list(UserFile.objects.filter(jobs=job, type="O").order_by('-id')[:1])
	if previous:
		uf = previous[0]
		uf.input_file = output_file
		# Keep the stored metadata in step with the new contents.
		uf.size = output_file.size
		uf.rows = nrows
		uf.columns = ncols
	else:
		uf = UserFile(input_file=output_file, user=user, size=output_file.size, rows=nrows, columns=ncols, type="O")
	# Must be saved before the relation to the job can be added.
	uf.save()
	uf.jobs.add(job)

	job.set_status("results")
	# NOTE(review): the final value is 100. while in-loop values are fractions
	# below 1; kept as-is because the consumer of q is not visible here.
	q.put(100.)
	email_body = (
		"\n"
		"Hello,\n"
		"\n"
		"The final results for your file cleaning job, %s are ready! You can download your results here:\n"
		"http://dedool.com/files/\n"
		"\n"
		"Thank you,\n"
		"The Dedool.com team\n"
	) % job.name
	send_mail("Dedool.com Final Results for Job %s Ready!" % job.name, email_body, "*****@*****.**", [job.user.email])