def create_sample_file_library(prefix="/var/www/sample/", user_id=1):
    """Utility function to create the sample file library.

    Reads each sample file from *prefix*, counts its rows and columns
    (delimiter auto-detected as tab or comma from the first line), and
    stores it as a UserFile of type "S" owned by the user with *user_id*.

    prefix  -- directory containing the sample files (default kept for
               backward compatibility with the original hard-coded path)
    user_id -- primary key of the owning User (default 1, as before)
    """
    import os
    from django.core.files import File as DjangoFile

    sample_files = ["celebrity.txt", "birdNybirdExtracted.txt",
                    "dbgen-1k_input.txt", "restaurant.txt",
                    "dbgen_50k_input.txt"]
    for fn in sample_files:
        path = os.path.join(prefix, fn)
        with open(path) as f:
            lines = f.readlines()
            # Rewind so DjangoFile(f) can re-read the file contents below.
            f.seek(0)
            rows = len(lines)
            # Delimiter heuristic: tab wins if present in the header line.
            delimiter = "\t" if "\t" in lines[0] else ","
            columns = len(lines[0].split(delimiter))
            size = os.stat(path).st_size
            uf = UserFile(user=User.objects.get(id=user_id),
                          input_file=DjangoFile(f),
                          size=size, rows=rows, columns=columns,
                          type="S")
            uf.save()
def save_results_mp(job, user, q):
    """Create final results file.

    Rows whose row-level EditedResult marks them deleted are not included
    in the final file.  If any edited cell values exist in the EditedResult
    table (keyed "<row>-<col>"), the output row is rebuilt from them.

    job  -- the job whose results are being finalized
    user -- owner of the resulting UserFile
    q    -- multiprocessing queue used to report progress; receives
            fractional progress values during the loop and 100. at the end
            (NOTE(review): mixed fraction/percent scale preserved from the
            original -- confirm against the consumer)
    """
    connection.close()  # needed bc Django + multiprocessing = messy

    results_csv = []
    original = get_string_from_s3(USER_FILE_BUCKET,
                                  job.get_input_file().name).split('\n')
    # Delimiter heuristic: tab wins if present in the first line.
    marker = '\t' if '\t' in original[0] else ','
    ncols = job.get_user_file().columns

    # generate data file
    for i, line in enumerate(original):
        if not line.strip():
            # split('\n') never leaves "\n" on a line (the original
            # compared against "\n" and so never skipped anything);
            # skip genuinely empty lines instead.
            continue
        # Row-level delete flag: an EditedResult keyed by the bare row id
        # holds "false" when the row is kept, anything else when deleted.
        try:
            marked_delete = EditedResult.objects.get(
                job=job, local_id=(i + 1)).value.lower() != "false"
        except EditedResult.DoesNotExist:
            marked_delete = False
        except EditedResult.MultipleObjectsReturned:
            marked_delete = EditedResult.objects.filter(
                job=job, local_id=(i + 1))[0].value.lower() != "false"

        if not marked_delete:  # if row is checked, don't include in final file
            cells = line.split(marker)
            if len(cells) != ncols:
                # Malformed row: pass it through unedited.
                edited = ','.join(cells)
            else:
                edited = []
                for j in range(ncols):
                    # Cell-level edits are keyed "<row>-<col>".
                    try:
                        edited.append(EditedResult.objects.get(
                            job=job,
                            local_id='%i-%i' % (i + 1, j)).value)
                    except EditedResult.DoesNotExist:
                        edited.append(cells[j])
                    except EditedResult.MultipleObjectsReturned:
                        edited.append(EditedResult.objects.filter(
                            job=job,
                            local_id='%i-%i' % (i + 1, j))[0].value)
                edited = ','.join(edited)
            results_csv.append(edited)

        # Progress every 10 rows.  The original float-modulo test
        # ((i/n) % (10/n) == 0) depended on exact float equality and
        # almost never fired; this is the integer equivalent of its intent.
        if i % 10 == 0:
            q.put(float(i) / len(original))

    nrows = len(results_csv)
    results_csv = "\n".join(results_csv)
    latest = UserFile.objects.latest('id')
    output_file = ContentFile(results_csv)
    output_file.name = "%s.%i.%i.cleaned.csv" % (
        ".".join(job.get_input_file().name.split(".")[:-1]),
        job.id, latest.id + 1)

    try:
        # Reuse the newest UserFile already linked to this job, if any.
        # (The original called UserFile.filter(...) -- missing .objects --
        # which always raised and fell into the bare except, so a new
        # file was created every time.)
        uf = UserFile.objects.filter(jobs=job).order_by('id').reverse()[0]
        uf.input_file = output_file
        uf.save()
    except IndexError:
        # No UserFile linked to this job yet: create the output file record.
        uf = UserFile(input_file=output_file, user=user,
                      size=output_file.size, rows=nrows,
                      columns=ncols, type="O")
        uf.save()
    finally:
        uf.jobs.add(job)
        uf.save()

    job.set_status("results")
    q.put(100.)

    email_body = lambda name: \
        """ Hello, The final results for your file cleaning job, %s are ready! You can download your results here: http://dedool.com/files/ Thank you, The Dedool.com team """ % (name)
    send_mail("Dedool.com Final Results for Job %s Ready!" % job.name,
              email_body(job.name), "*****@*****.**",
              [job.user.email])