def add_by_hand(request):
    '''A form to add regulations that can't be scraped.

    Currently, this is only used for HTML files.
    '''
    # Initial visit (non-POST): show an empty form.
    if request.method != 'POST':
        blank_form = RegulationForm()
        return render_to_response('parser_tools/add_by_hand.html',
                                  {'form': blank_form},
                                  context_instance=RequestContext(request))

    # POST: validate and save the submitted form data.
    submitted = RegulationForm(request.POST)
    if not submitted.is_valid():
        return render_to_response('parser_tools/add_by_hand.html',
                                  {'message': 'Your submission was invalid.'},
                                  context_instance=RequestContext(request))

    regulation = submitted.save()
    generate_regulation_pages(regulation)
    return redirect('parser_tools.views.add_by_hand')
def scrape(request):
    '''Download every PDF linked from a user-supplied URL and register each
    one as a Regulation (plus per-page Page objects via
    generate_regulation_pages in helper.py).

    GET  -> render an empty ScrapeForm.
    POST -> run the scrape and render an HTML progress log in ``output``.
    '''
    # Local imports: the module's top-level import block is outside this
    # view, so the new stdlib helpers are brought into scope here.
    import glob
    import shutil
    import subprocess

    if request.method == 'POST':
        form = ScrapeForm(request.POST)
        if form.is_valid():
            # Retrieve form data
            parent_url = form.cleaned_data['url']
            parent_title = form.cleaned_data['title']

            # Initialize output string (an HTML fragment shown to the user)
            output = ''
            tmp_path = DATA_PATH + 'tmp/'

            # Create temporary directory to contain downloaded files
            if not os.path.isdir(tmp_path):
                os.mkdir(tmp_path)

            # Download PDFs from the specified URL to the temporary
            # directory.  The argument-list form of subprocess never passes
            # the user-supplied URL through a shell (the previous
            # os.system() string was vulnerable to shell injection).
            output += '<h3>Downloading files</h3>'
            subprocess.call(['wget',
                             '-o', tmp_path + 'wget.log',
                             '-P', tmp_path,
                             '-r', '-l1', '-A.pdf',
                             '-H', '-D.gov,.us', '-nd', '-np',
                             parent_url])
            # wget may leave robots.txt variants behind; discard them.
            for robots_file in glob.glob(tmp_path + 'robots.*'):
                os.remove(robots_file)
            output += '<p>Done.</p>'

            # Create ordered dictionary of url/filename pairs
            # Format: urls_and_filenames['filename.pdf'] = 'http://path/to/file/'
            urls_and_filenames = collections.OrderedDict()
            output += '<h3>Generating list of downloaded URLs</h3>'
            # open() instead of the Python-2-only file() builtin.
            with open(tmp_path + 'wget.log') as f:
                log = f.read()
            downloaded_urls = re.findall(r'(https?://\S+)', log)
            for this_url in downloaded_urls:
                # Only keep URLs for PDF files
                if this_url[-4:] == '.pdf':
                    filename = this_url.rsplit('/', 1)[1]
                    urls_and_filenames[filename] = this_url
                    output += 'Kept ' + this_url + '<br />'
                else:
                    output += '<em>Not a pdf:' + this_url + '</em><br />'
                    output += '<strong>removed</strong><br />'
            output += '<h3>Filename/URL pairs</h3>' + str(urls_and_filenames)

            # For each downloaded file: check for naming conflicts, create
            # Regulation object, and move file out of tmp directory
            output += '<h3>Processing downloaded files</h3>'
            for this_filename, this_url in urls_and_filenames.items():
                output += 'Checking ' + this_filename + '... '
                current_files = os.listdir(DATA_PATH)
                if this_filename in current_files:
                    # Conflict: derive a unique name file-1.pdf .. file-99.pdf
                    output += '<strong>conflict</strong><br />'
                    for n in range(1, 100):
                        renamed_file = this_filename[:-4] + '-' + str(n) + '.pdf'
                        if renamed_file not in current_files:
                            # Unique name found; keep renamed_file and stop.
                            output += '...renaming to ' + renamed_file + '<br />'
                            break
                    # Rename the file on disk.  (The old os.system('mv ...')
                    # call concatenated the literal '<br />' markup into the
                    # shell command, corrupting the target filename.)
                    shutil.move(tmp_path + this_filename,
                                tmp_path + renamed_file)
                    # Use the new name for the Regulation object below.
                    this_filename = renamed_file
                else:
                    output += 'no conflict<br />'

                # Move file out of tmp directory into DATA_PATH
                shutil.move(tmp_path + this_filename, DATA_PATH)

                # Create new Regulation object
                output += ('...creating Django objects for ' + this_filename
                           + ' / ' + this_url + '<br />')
                new_regulation = Regulation(
                    parent_title=parent_title,  # from the POST form data
                    parent_url=parent_url,      # from the POST form data
                    url=this_url,
                    media='PDF',
                    filename=this_filename,
                )
                new_regulation.save()
                # Create Page objects for each page in the regulation
                # This function lives in helper.py
                generate_regulation_pages(new_regulation)

            # Delete wget log and remove the now-empty tmp directory
            output += '<h3>Cleaning up</h3>'
            os.remove(tmp_path + 'wget.log')
            os.rmdir(tmp_path)
            output += '<p>Done.</p>'

            return render_to_response('parser_tools/scrape.html',
                                      {'output': output},
                                      context_instance=RequestContext(request))
        else:
            return HttpResponse('Your submission is invalid.')
    else:
        # Bug fix: the old code assigned the ScrapeForm *class* rather than
        # an instance (cf. RegulationForm() in add_by_hand).
        form = ScrapeForm()
        return render_to_response('parser_tools/scrape.html',
                                  {'form': form, 'DATA_PATH': DATA_PATH},
                                  context_instance=RequestContext(request))