Example #1

# Assumed imports for this example; module paths are inferred from the code
# (e.g. redirect('parser_tools.views.add_by_hand') and the comment below,
# "This function lives in helper.py"), not confirmed by the source.
from django.shortcuts import redirect, render_to_response
from django.template import RequestContext

from parser_tools.forms import RegulationForm  # assumed module path
from parser_tools.helper import generate_regulation_pages  # assumed module path


def add_by_hand(request):
    '''Render and process a form for adding regulations that can't be scraped.

    Currently, this is only used for HTML files.
    '''
    # add HTML file to folder
    if request.method == 'POST':
        # validate and save form data
        form = RegulationForm(request.POST)
        if form.is_valid():
            regulation = form.save()
            generate_regulation_pages(regulation)
            return redirect('parser_tools.views.add_by_hand')
        else:
            message = 'Your submission was invalid.'
            return render_to_response('parser_tools/add_by_hand.html',
                                      {'message': message},
                                      context_instance=RequestContext(request))
    else:
        form = RegulationForm()
        return render_to_response('parser_tools/add_by_hand.html',
                                  {'form': form},
                                  context_instance=RequestContext(request))
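
For context, here is a minimal sketch of the pieces these views assume. The
Regulation fields are inferred from the Regulation(...) call in example #2
below, and the module layout and URLconf entries are hypothetical, written in
the old string-based Django style that redirect('parser_tools.views.add_by_hand')
implies; none of this is confirmed by the source.

# parser_tools/models.py (hypothetical sketch)
from django.db import models

class Regulation(models.Model):
    parent_title = models.CharField(max_length=255)
    parent_url = models.URLField()
    url = models.URLField()
    media = models.CharField(max_length=10)  # e.g. 'PDF' or 'HTML'
    filename = models.CharField(max_length=255)

# parser_tools/forms.py (hypothetical sketch)
from django import forms

class RegulationForm(forms.ModelForm):
    class Meta:
        model = Regulation  # all model fields exposed, the old-Django default

# urls.py (hypothetical sketch, matching the string-based redirect() call)
from django.conf.urls import patterns

urlpatterns = patterns('',
    (r'^add-by-hand/$', 'parser_tools.views.add_by_hand'),
    (r'^scrape/$', 'parser_tools.views.scrape'),
)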
Example #2

# Assumed imports for this example; module paths are inferred from the code,
# not confirmed by the source.
import collections
import os
import re

from django.http import HttpResponse
from django.shortcuts import render_to_response
from django.template import RequestContext

from parser_tools.forms import ScrapeForm  # assumed module path
from parser_tools.helper import generate_regulation_pages  # assumed module path
from parser_tools.models import Regulation  # assumed module path
from parser_tools.settings import DATA_PATH  # assumed: a path string ending in '/'


def scrape(request):
    if request.method == 'POST':
        form = ScrapeForm(request.POST)
        if form.is_valid():
            # Retrieve form data
            parent_url = form.cleaned_data['url']
            parent_title = form.cleaned_data['title']

            # Initialize output string
            output = ''

            # Create temporary directory to contain downloaded files
            os.system("mkdir " + DATA_PATH + "tmp/")

            # Download PDFs from the specified URL to the temporary directory
            output = output + "<h3>Downloading files</h3>"
            os.system("wget -o " + DATA_PATH + "tmp/wget.log -P " + DATA_PATH + "tmp/ -r -l1 -A.pdf \
            -H -D.gov,.us -nd -np " + parent_url)
            os.system("rm " + DATA_PATH + "tmp/robots.*")
            output = output + "<p>Done.</p>"

            # Create ordered dictionary of url/filename pairs
            # Format: urls_and_filenames['filename.pdf'] = 'http://path/to/file/'
            urls_and_filenames = collections.OrderedDict()
            output = output + '<h3>Generating list of downloaded URLs</h3>'
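            # The -o flag above captured wget's normal output in wget.log,
            # which mentions every URL it requested; scanning that log with
            # a regex recovers the list of downloaded URLs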
            with open(DATA_PATH + 'tmp/wget.log') as f:
                log = f.read()
            downloaded_urls = re.findall(r'(https?://\S+)', log)

            for this_url in downloaded_urls:
                # Only keep URLs for PDF files
                if this_url.endswith('.pdf'):
                    filename = this_url.rsplit('/', 1)[1]
                    urls_and_filenames[filename] = this_url
                    output = output + 'Kept ' + this_url + '<br />'
                else:
                    output = output + '<em>Not a PDF: ' + this_url + '</em><br />'
                    output = output + '<strong>removed</strong><br />'
            output = output + '<h3>Filename/URL pairs</h3>' \
                + str(urls_and_filenames)

            # For each downloaded file: check for naming conflicts, create
            # Regulation object, and move file out of tmp directory
            output = output + '<h3>Processing downloaded files</h3>'
            for this_filename, this_url in urls_and_filenames.items():
                output = output + "Checking " + this_filename + "... "
                current_files = os.listdir(DATA_PATH)
                if this_filename in current_files:
                    # Rename conflicting files
                    output = output + "<strong>conflict</strong><br />"
                    for n in range(1, 100):
                        renamed_file = this_filename[:-4] + '-' + str(n) + '.pdf'
                        if renamed_file not in current_files:
                            # We've found a unique filename; end the loop and
                            # keep the current value in renamed_file
                            output = output + '...renaming to ' + renamed_file \
                                + '<br />'
                            break
                    # Rename the file on disk
                    os.system('mv ' + DATA_PATH + 'tmp/' + this_filename +
                              ' ' + DATA_PATH + 'tmp/' + renamed_file)

                    # Use the renamed file for the rest of this iteration;
                    # this value is used below to create the Regulation object
                    this_filename = renamed_file
                else:
                    output = output + "no conflict<br />"

                # Move file out of tmp directory
                os.system('mv ' + DATA_PATH + 'tmp/' + this_filename + ' ' + DATA_PATH)

                # Create new Regulation object
                output = output + '...creating Django objects for ' + \
                    this_filename + ' / ' + this_url + '<br />'
                new_regulation = Regulation(
                    parent_title=parent_title, # from the POST form data
                    parent_url=parent_url, # from the POST form data
                    url=this_url,
                    media='PDF',
                    filename=this_filename,
                )
                new_regulation.save()

                # Create Page objects for each page in the regulation
                # This function lives in helper.py
                generate_regulation_pages(new_regulation)
 
            # Delete the wget log and remove the now-empty tmp directory
            output = output + '<h3>Cleaning up</h3>'
            os.system('rm ' + DATA_PATH + 'tmp/wget.log')
            os.system('rmdir ' + DATA_PATH + 'tmp/')
            output = output + '<p>Done.</p>'
            return render_to_response('parser_tools/scrape.html',
                                      {'output': output},
                                      context_instance=RequestContext(request))

        else:
            return HttpResponse('Your submission is invalid.')
    else:
        form = ScrapeForm()
    return render_to_response('parser_tools/scrape.html',
                              {'form': form, 'DATA_PATH': DATA_PATH},
                              context_instance=RequestContext(request))
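
The os.system() calls above build shell commands by string concatenation,
which breaks on filenames containing spaces and lets a maliciously crafted
submitted URL inject arbitrary shell commands. Here is a minimal sketch of
the same download-and-cleanup sequence using the standard library's
subprocess, os, and shutil modules instead; the helper names below and the
assumption that DATA_PATH ends in '/' are carried over from the views, not
from any confirmed implementation.

import os
import shutil
import subprocess

def download_pdfs(parent_url, data_path):
    # Hypothetical replacement for the mkdir/wget/rm shell calls above
    tmp_dir = os.path.join(data_path, 'tmp')
    os.makedirs(tmp_dir)

    # Passing an argument list avoids shell parsing entirely, so a crafted
    # parent_url cannot inject extra commands
    subprocess.call([
        'wget', '-o', os.path.join(tmp_dir, 'wget.log'), '-P', tmp_dir,
        '-r', '-l1', '-A.pdf', '-H', '-D.gov,.us', '-nd', '-np', parent_url,
    ])

    # Equivalent of 'rm tmp/robots.*', without invoking a shell glob
    for name in os.listdir(tmp_dir):
        if name.startswith('robots.'):
            os.remove(os.path.join(tmp_dir, name))

def move_out_of_tmp(filename, data_path):
    # Hypothetical replacement for the 'mv' calls above
    shutil.move(os.path.join(data_path, 'tmp', filename), data_path)

def cleanup_tmp(data_path):
    # Hypothetical replacement for the final 'rm'/'rmdir' calls above
    os.remove(os.path.join(data_path, 'tmp', 'wget.log'))
    os.rmdir(os.path.join(data_path, 'tmp'))  # like rmdir, must be empty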