def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture every url that has no Page yet and record it in file and db.

    Urls for which a Page with the same url already exists are skipped.
    For every other url the page is loaded in a PageCapture, a screenshot
    is requested at DATA_DIR/<converted url>.png, one comma separated
    metadata line is appended to 'page_meta_data.txt' and a Page is saved
    in the given category. A ValueError raised while handling a url is
    reported on stdout and the loop moves on to the next url.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category in which the pages fall into
    :param halved_screen_shot: when true, crop_screen_shot(..., 0, 0, 1000, 1000)
        is used instead of take_screen_shot
    :return: None
    """
    # The context manager closes the metadata file even if an exception
    # other than ValueError aborts the loop.
    with open('page_meta_data.txt', 'a') as meta_file:
        for url in url_list:
            matches = Page.objects.filter(url=url)
            existing = matches[0] if matches else None
            if existing:
                print("Already added: {0}".format(existing.title))
                continue

            try:
                # create PageCapture object - specify the browser to be 800 x 600.
                pc = PageCapture(url, 800, 600)
                url_file_name = convert_url_to_filename(url) + '.png'
                image_file_name = os.path.join(DATA_DIR, url_file_name)
                pc.load_url(url)

                # fetch the screen-shot
                if halved_screen_shot:
                    pc.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
                else:
                    pc.take_screen_shot(image_file_name)

                title = pc.get_page_title()

                # The metadata line is written before the db save so the
                # file can be consumed independently of the database.
                # NOTE(review): fields are not escaped, so a title that
                # contains a comma adds an extra column to the line.
                meta_file.write('%s,%s,%s,%s\n' % (
                    category.name, url, title, image_file_name))
                print("written {0} to file.".format(title))

                # The stored screenshot path is built from MEDIA_ROOT, not
                # DATA_DIR: per the original author's note, DATA_DIR put
                # the current working directory into the url.
                p = Page(category=category,
                         title=title,
                         is_shown=True,
                         url=url,
                         screenshot=os.path.join('/', MEDIA_ROOT,
                                                 url_file_name))
                p.save()
                print('Page title= ' + p.title + ' has been saved!')
            except ValueError as err:
                # Report the actual error and carry on with the next url.
                print('Page  has ((NOT)) been saved!')
                print('ERROR IS {0}'.format(err))
# Example #2
# 0
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Capture every url in url_list and save it as a Page in the category.

    For each url the page is loaded in a PageCapture, a screenshot is
    requested at DATA_DIR/<converted url>.png and a Page holding the page
    title and screenshot path is saved. A ValueError raised while handling
    a url is reported on stdout and the loop moves on to the next url.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category in which the pages fall into
    :param halved_screen_shot: when true, crop_screen_shot is called with
        one of two fixed argument sets (picked at random) instead of
        take_screen_shot
    :return: None
    """
    for url in url_list:
        try:
            # create PageCapture object - specify the browser to be 800 x 600.
            pc = PageCapture(url, 800, 600)
            url_file_name = convert_url_to_filename(url) + '.png'
            image_file_name = os.path.join(DATA_DIR, url_file_name)
            pc.load_url(url)

            # fetch the screen-shot
            if halved_screen_shot:
                # Pick one of the two crop argument sets at random.
                if random.random() > 0.5:
                    crop_args = (200, 400, 700, 900)
                else:
                    crop_args = (0, 0, 1000, 1000)
                pc.crop_screen_shot(image_file_name, *crop_args)
            else:
                pc.take_screen_shot(image_file_name)

            title = pc.get_page_title()

            # The stored screenshot path is built from MEDIA_ROOT, not
            # DATA_DIR: per the original author's note, DATA_DIR put the
            # current working directory into the url.
            p = Page(category=category,
                     title=title,
                     is_shown=True,
                     url=url,
                     screenshot=os.path.join('/', MEDIA_ROOT, url_file_name))
            p.save()
            print('Page title= ' + p.title + ' has been saved!')
        except ValueError as err:
            # Print the caught error itself rather than the ValueError
            # class, so the reason for the failure is visible.
            print('Page  has ((NOT)) been saved!')
            print('ERROR IS')
            print(err)
# Example #3
# 0
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Load each url, request a screenshot and store the result as a Page.

    The screenshot target is DATA_DIR/<converted url>.png. The saved Page
    carries the title reported by the PageCapture and a screenshot path
    built from MEDIA_ROOT. When a ValueError is raised for a url, it is
    printed and processing continues with the next url.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category in which the pages fall into
    :param halved_screen_shot: when true, crop_screen_shot is called with
        one of two fixed argument sets (picked at random) instead of
        take_screen_shot
    :return: None
    """
    for url in url_list:
        try:
            # create PageCapture object - specify the browser to be 800 x 600.
            capture = PageCapture(url, 800, 600)
            url_file_name = convert_url_to_filename(url) + '.png'
            image_file_name = os.path.join(DATA_DIR, url_file_name)
            capture.load_url(url)

            # fetch the screen-shot
            if not halved_screen_shot:
                capture.take_screen_shot(image_file_name)
            elif random.random() > 0.5:
                capture.crop_screen_shot(image_file_name, 200, 400, 700, 900)
            else:
                capture.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)

            title = capture.get_page_title()

            # MEDIA_ROOT is used for the stored path instead of DATA_DIR:
            # per the original author's note, DATA_DIR put the current
            # working directory into the url.
            screenshot_path = os.path.join('/', MEDIA_ROOT, url_file_name)
            p = Page(category=category, title=title, is_shown=True,
                     url=url, screenshot=screenshot_path)
            p.save()
            print('Page title= ' + p.title + ' has been saved!')
        except ValueError as err:
            # Show the caught error instead of the bare ValueError class,
            # which carried no information about what went wrong.
            print('Page  has ((NOT)) been saved!')
            print('ERROR IS')
            print(err)
def populate_pages(url_list, category, halved_screen_shot=False):
    """
    Populate Page rows (and a metadata file) for urls not stored yet.

    A url is skipped when Page.objects.filter(url=url) already returns a
    row. Otherwise the page is loaded in a PageCapture, a screenshot is
    requested at DATA_DIR/<converted url>.png, a
    "category name,url,title,image path" line is appended to
    'page_meta_data.txt' and a Page is saved in the given category. A
    ValueError raised for a url is printed and the next url is processed.

    :param url_list: a list of the urls for the pages that are going to be populated
    :param category: the category in which the pages fall into
    :param halved_screen_shot: when true, crop_screen_shot(..., 0, 0, 1000, 1000)
        is used instead of take_screen_shot
    :return: None
    """
    # Opened through a context manager so the handle is released even when
    # an exception other than ValueError propagates out of the loop.
    with open('page_meta_data.txt', 'a') as meta_file:
        for url in url_list:
            found = Page.objects.filter(url=url)
            known_page = found[0] if found else None
            if known_page:
                print("Already added: {0}".format(known_page.title))
                continue

            try:
                # create PageCapture object - specify the browser to be 800 x 600.
                pc = PageCapture(url, 800, 600)
                url_file_name = convert_url_to_filename(url) + '.png'
                image_file_name = os.path.join(DATA_DIR, url_file_name)
                pc.load_url(url)

                # fetch the screen-shot
                if halved_screen_shot:
                    pc.crop_screen_shot(image_file_name, 0, 0, 1000, 1000)
                else:
                    pc.take_screen_shot(image_file_name)

                title = pc.get_page_title()

                # Written before the db save so the file can be used
                # without the database.
                # NOTE(review): values are not escaped; a comma inside the
                # title yields an extra field on the line.
                meta_file.write('%s,%s,%s,%s\n' % (
                    category.name,
                    url,
                    title,
                    image_file_name,
                ))
                print("written {0} to file.".format(title))

                # MEDIA_ROOT is used for the stored path instead of
                # DATA_DIR: per the original author's note, DATA_DIR put
                # the current working directory into the url.
                p = Page(category=category,
                         title=title,
                         is_shown=True,
                         url=url,
                         screenshot=os.path.join('/', MEDIA_ROOT,
                                                 url_file_name))
                p.save()
                print('Page title= ' + p.title + ' has been saved!')
            except ValueError as err:
                # Include the real error text; the loop then continues
                # with the next url.
                print('Page  has ((NOT)) been saved!')
                print('ERROR IS {0}'.format(err))