Exemplo n.º 1
0
def handle_submission(request):
    """Render the URL submission form (GET) or process a submitted URL (POST).

    On POST the submitted URL is normalized and validated. If it is not
    already known as a ``ParentUrl`` or a ``Submission``, a new
    ``Submission`` row is saved. The URL is then sent to the crawler
    message queue together with a batch id:

    * ``0`` when the URL already exists as a ``ParentUrl``;
    * the id of the existing ``Submission`` when one is found;
    * the id of the newly created ``Submission`` otherwise.

    Returns the rendered ``submit.htm`` response. For HTTP methods other
    than GET and POST nothing is rendered and ``None`` is returned.
    """
    if request.method == 'GET':
        return render_to_response('submit.htm', {})
    if request.method != 'POST':
        return None

    # A missing field is treated like an empty one instead of raising a
    # key error; an empty URL then goes through the normal validation path.
    url = request.POST.get('url', '')
    email = request.POST.get('email', '')

    msg = 'Thank you for your submission.'

    # url normalization
    norm_url = crawler.url_normalization.get_canonical_url(url)

    if norm_url == '':
        return render_to_response('submit.htm', {'err': 'Invalid URL.'})

    if len(norm_url) > 255:
        data = {
            'err': 'The length of the URL cannot exceed 255.'
        }
        return render_to_response('submit.htm', data)

    # check if already exist
    p = None
    old_sub = None
    new_sub = None

    try:
        p = ParentUrl.objects.get(url=norm_url)
        batch = 0
    except ParentUrl.DoesNotExist:
        try:
            old_sub = Submission.objects.get(url=norm_url)
            batch = old_sub.id
        except Submission.DoesNotExist:
            # only new urls will be saved to submission database
            new_sub = Submission(url=norm_url, email=email)
            new_sub.save()
            batch = new_sub.id

    # submitted url will be recrawled (even it's old)
    # NOTE(review): the raw ``url`` is queued, not ``norm_url`` -- confirm
    # that this is what the crawler expects.
    s = crawler.submit.Submitter(config.amq_host, 61613, config.amq_queue)
    s.connect_mq()
    try:
        s.submit(url, batch)
    finally:
        # Always release the queue connection, even when submit() fails.
        s.disconnect_mq()

    data = {
        'parent': p,
        'old_sub': old_sub,
        'new_sub': new_sub,
        'msg': msg
    }

    return render_to_response('submit.htm', data)
Exemplo n.º 2
0
# Log file that receives the records emitted while handling public submissions.
_SUBMISSION_PUB_LOG_PATH = "/data/tmp/handle_submission_pub.log"


def _get_submission_pub_logger():
    """Return the ``handle_submission_pub`` logger, setting up logging once.

    The first call configures the root logger and tries to attach a file
    handler writing to ``_SUBMISSION_PUB_LOG_PATH``. Later calls only return
    the logger, so handlers (and open log files) do not pile up per request.
    If the log file cannot be opened, a warning is logged and the file
    handler is skipped.
    """
    if not getattr(_get_submission_pub_logger, "configured", False):
        # Set the flag first so a failure below is not retried on every request.
        _get_submission_pub_logger.configured = True
        logging.basicConfig(level=logging.DEBUG,
                            format="%(asctime)s %(name)s %(levelname)s %(message)s")
        try:
            logfh = logging.FileHandler(_SUBMISSION_PUB_LOG_PATH, mode="a")
        except (IOError, OSError) as exc:
            logging.getLogger("handle_submission_pub").warning(
                "could not open log file %s: %s", _SUBMISSION_PUB_LOG_PATH, exc)
        else:
            logging.getLogger("").addHandler(logfh)
    return logging.getLogger("handle_submission_pub")


def handle_submission_pub(request):
    """Render the public submission form (GET) or process it (POST).

    On POST the form is validated through ``CaptchaForm_pub``. For a valid
    form the URL is normalized and checked; if it is not already known as a
    ``ParentUrl`` or a ``Submission``, a new ``Submission`` (with the
    submitter's name and email) is saved. This view only records the
    submission; it does not send the URL to the crawler queue.

    Returns the rendered ``submit_pub.htm`` response. For HTTP methods other
    than GET and POST nothing is rendered and ``None`` is returned.
    """
    logger = _get_submission_pub_logger()

    if request.method == 'GET':
        logger.debug("reqeust method is GET")
        return render_to_response('submit_pub.htm', {'form': CaptchaForm_pub()})
    if request.method != 'POST':
        return None

    logger.debug("reqeust method is POST")
    form = CaptchaForm_pub(request.POST)
    if not form.is_valid():
        # Re-render the bound form so it can show its validation errors.
        return render_to_response('submit_pub.htm', {'form': form})

    url = form.cleaned_data['url']
    email = form.cleaned_data['email']
    submitter_name = form.cleaned_data["submitter_name"]

    msg = 'Thank you for your submission.'

    norm_url = crawler.url_normalization.get_canonical_url(url)

    if norm_url == '':
        data = {
            'err': 'Invalid URL.',
            'form': form
        }
        return render_to_response('submit_pub.htm', data)

    if len(norm_url) > 255:
        data = {
            'err': 'The length of the URL cannot exceed 255.',
            'form': form
        }
        return render_to_response('submit_pub.htm', data)

    # check if already exist
    p = None
    old_sub = None
    new_sub = None

    try:
        p = ParentUrl.objects.get(url=norm_url)
        logger.debug("URL found in submission_pub table")
    except ParentUrl.DoesNotExist:
        try:
            old_sub = Submission.objects.get(url=norm_url)
        except Submission.DoesNotExist:
            logger.debug("URL inserted into submission_pub table")
            # only new urls will be saved to submission_pub database
            new_sub = Submission(url=norm_url, email=email,
                                 submitter_name=submitter_name)
            new_sub.save()

    data = {
        'parent': p,
        'old_sub': old_sub,
        'new_sub': new_sub,
        'msg': msg,
        'form': form
    }

    return render_to_response('submit_pub.htm', data)
Exemplo n.º 3
0
def handle_submission(request):
    """Render the captcha-protected submission form (GET) or process it (POST).

    On POST the form is validated through ``CaptchaForm``. For a valid form
    the URL is normalized and checked; if it is not already known as a
    ``ParentUrl`` or a ``Submission``, a new ``Submission`` is saved.

    NOTE: this variant only records the submission. It does not send the
    URL to the crawler message queue.

    Returns the rendered ``submit.htm`` response. For HTTP methods other
    than GET and POST nothing is rendered and ``None`` is returned.
    """
    if request.method == 'GET':
        return render_to_response('submit.htm', {'form': CaptchaForm()})
    if request.method != 'POST':
        return None

    form = CaptchaForm(request.POST)
    if not form.is_valid():
        # Re-render the bound form so it can show its validation errors.
        return render_to_response('submit.htm', {'form': form})

    url = form.cleaned_data['url']
    email = form.cleaned_data['email']

    msg = 'Thank you for your submission.'

    norm_url = crawler.url_normalization.get_canonical_url(url)

    if norm_url == '':
        data = {
            'err': 'Invalid URL.',
            'form': form
        }
        return render_to_response('submit.htm', data)

    if len(norm_url) > 255:
        data = {
            'err': 'The length of the URL cannot exceed 255.',
            'form': form
        }
        return render_to_response('submit.htm', data)

    # check if already exist
    p = None
    old_sub = None
    new_sub = None

    try:
        p = ParentUrl.objects.get(url=norm_url)
    except ParentUrl.DoesNotExist:
        try:
            old_sub = Submission.objects.get(url=norm_url)
        except Submission.DoesNotExist:
            # only new urls will be saved to submission database
            new_sub = Submission(url=norm_url, email=email)
            new_sub.save()

    data = {
        'parent': p,
        'old_sub': old_sub,
        'new_sub': new_sub,
        'msg': msg,
        'form': form
    }

    return render_to_response('submit.htm', data)