Example #1
 def test_crawl_mock_pages_all_products_no_repetitions(self):
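     # Each mocked path maps to (page title, product name, then the links on that page).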
     mock_params = {
         '/produto_inicial/p':
         ('Pagina Inicial', 'Produto Inicial', 'produto_1/p', 'produto_2/p',
          'produto_3/p'),
         '/produto_1/p': ('Pagina Produto 1', 'Produto 1', 'produto_4/p',
                          'produto_5/p', 'produto_6/p'),
         '/produto_2/p': ('Pagina Produto 2', 'Produto 2', 'produto_7/p',
                          'produto_8/p', 'produto_9/p'),
         '/produto_3/p': ('Pagina Produto 3', 'Produto 3', 'produto_10/p',
                          'produto_11/p', 'produto_12/p')
     }
     with patch('crawler.get_page_contents',
                MockPageGenerator(mock_params)):
         crawler.main(['-d', '1', '-o', 'teste.csv', '/produto_inicial/p'])
     expected = [[
         'Produto Inicial', 'Pagina Inicial',
         'http://www.epocacosmeticos.com.br/produto_inicial/p'
     ],
                 [
                     'Produto 1', 'Pagina Produto 1',
                     'http://www.epocacosmeticos.com.br/produto_1/p'
                 ],
                 [
                     'Produto 2', 'Pagina Produto 2',
                     'http://www.epocacosmeticos.com.br/produto_2/p'
                 ],
                 [
                     'Produto 3', 'Pagina Produto 3',
                     'http://www.epocacosmeticos.com.br/produto_3/p'
                 ]]
     self.assertEqual(expected, self.load_result_csv())
Example #2
 def test_crawl_mock_pages_mixed_with_repetitions(self):
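     # Only '/p' product pages should be recorded, each exactly once despite repeated links.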
     mock_params = {
         '/pagina_inicial': ('Pagina Inicial', 'Produto Inicial',
                             'produto_1/p', 'pagina_2', 'produto_3/p'),
         '/produto_1/p': ('Pagina Produto 1', 'Produto 1', 'produto_3/p',
                          'pagina_5', 'pagina_6'),
         '/pagina_2':
         ('Pagina 2', 'Página 2', 'produto_1/p', 'pagina_5', 'produto_3/p'),
         '/produto_3/p': ('Pagina Produto 3', 'Produto 3', 'produto_1/p',
                          'pagina_5', 'pagina_6'),
         '/pagina_5':
         ('Pagina 5', 'Página 2', 'produto_1/p', 'pagina_5', 'produto_3/p'),
         '/pagina_6':
         ('Pagina 6', 'Página 2', 'produto_1/p', 'pagina_5', 'produto_3/p'),
     }
     with patch('crawler.get_page_contents',
                MockPageGenerator(mock_params)):
         crawler.main(['-d', '2', '-o', 'teste.csv', '/pagina_inicial'])
     expected = [[
         'Produto 1', 'Pagina Produto 1',
         'http://www.epocacosmeticos.com.br/produto_1/p'
     ],
                 [
                     'Produto 3', 'Pagina Produto 3',
                     'http://www.epocacosmeticos.com.br/produto_3/p'
                 ]]
     self.assertEqual(expected, self.load_result_csv())
Example #3
 def test_crawl_lady_million(self):
     url = '/lady-million-eau-my-gold-eau-de-toilette-paco-rabanne-perfume-feminino/p'
     crawler.main(['-d', '0', '-o', 'teste.csv', url])
     expected = [[
         'Lady Million Eau my Gold Eau de Toilette Paco Rabanne - Perfume Feminino',
         'Perfume Lady Million Eau my Gold EDT Paco Rabanne Feminino - Época Cosméticos',
         'http://www.epocacosmeticos.com.br' + url
     ]]
     self.assertEqual(expected, self.load_result_csv())
Example #4
 def test_crawl_hypnose(self):
     url = '/hypnose-eau-de-toilette-lancome-perfume-feminino/p'
     crawler.main(['-d', '0', '-o', 'teste.csv', url])
     expected = [[
         'Hypnôse Eau de Toilette Lancôme - Perfume Feminino - 30ml',
         'Hypnôse Lancôme - Perfume Feminino - Época Cosméticos',
         'http://www.epocacosmeticos.com.br' + url
     ]]
     self.assertEqual(expected, self.load_result_csv())
Example #5
File: __init__.py Project: tkngch/feedpy
def main():
    args = parse_options()
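    # --serve implies both crawling and building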

    if args.crawl or args.serve:
        import crawler
        crawler.main()

    if args.build or args.serve:
        import builder
        builder.main()
Example #6
def main():
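    # Dispatch on a single positional command: 'serve' or 'crawl'.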
    if len(sys.argv) != 2:
        usage()
        sys.exit(1)

    if sys.argv[1] == 'serve':
        server.main()
    elif sys.argv[1] == 'crawl':
        crawler.main()
    else:
        usage()
        sys.exit(1)
Example #8
def test_crawler():
    """Testing index route"""

    crawler.main(test=True)

    with open('repos_test.pickle', 'rb') as f:
        list_of_repos = pickle.load(f)

        assert len(list_of_repos) > 0, 'No repo saved in pickle file!'
        assert list_of_repos[0]['repo_url'] != '', 'Repo URL empty!'
        assert list_of_repos[0]['repo_name'] != '', 'Repo Name empty!'
        assert list_of_repos[0]['stars'] != '', 'Repo has no stars!'
        assert list_of_repos[0]['forks'] != '', 'Repo has no forks!'

        # Teardown
        os.remove('repos_test.pickle')
Example #9
 def test_crawl_mock_pages_no_product(self):
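     # None of the mocked URLs end in '/p', so no products are recorded.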
     mock_params = {
         '/pagina_inicial': ('Pagina Inicial', 'Produto Inicial',
                             'pagina_1', 'pagina_2', 'pagina_3'),
         '/pagina_1': ('Pagina Produto 1', 'Produto 1', 'pagina_4',
                       'pagina_5', 'pagina_6'),
         '/pagina_2': ('Pagina Produto 2', 'Produto 2', 'pagina_7',
                       'pagina_8', 'pagina_9'),
         '/pagina_3': ('Pagina Produto 3', 'Produto 3', 'pagina_10',
                       'pagina_11', 'pagina_12')
     }
     with patch('crawler.get_page_contents',
                MockPageGenerator(mock_params)):
         crawler.main(['-d', '1', '-o', 'teste.csv', '/pagina_inicial'])
     expected = []
     self.assertEqual(expected, self.load_result_csv())
Example #10
class Main:
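    # Note: this runs at class-definition time, not on instantiation.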
    gazetteer = sys.argv[1]
    dataset = sys.argv[2]
    annotatedEntities = sys.argv[3]
    vocabularyFile = sys.argv[4]

    crawler.main(gazetteer, dataset, annotatedEntities, vocabularyFile)
Example #11
File: main.py Project: coil398/rcpepper
def main(url, settings):
    page = crawler.main(url, actions)[0]
    # with open('./html.txt', 'r') as f:
    #    page = f.read()

    scraper = scraping(page, settings)
    scraper.setStatus()
    scraper.displaySettings()
Example #12
def main():
    for fn in sys.argv[1:]:
        read_and_save(str(fn))
    # if (os.path.isfile("Output.txt")):
    # 	uniqlines = set(open("Output.txt").readlines())
    # 	fill = open("Output.txt", 'w').writelines(set(uniqlines))

    # Solution without messing with order
    # lines_seen = set() # holds lines already seen
    # outfile = open('Output.txt', "w")
    # for line in open('Output.txt', "r"):
    #     if line not in lines_seen: # not a duplicate
    #         outfile.write(line)
    #         lines_seen.add(line)
    # outfile.close()
    crawler.main()
Example #13
def scanner():
    form = Scanner()
    if form.validate_on_submit():
        flash('Scanning URL="%s"' % (form.seed_url.data))
        o = urlparse(form.seed_url.data)
        if o.scheme == 'http' or o.scheme == 'https':
            flash('Valid URL !')
            obj = main(form.seed_url.data)
            #XSS_Module(form.seed_url.data,obj)
            SQL_Module(form.seed_url.data, obj)
        else:
            flash('Invalid URL!')
    return render_template('scanner.html', title='Scanner', form=form)
Example #15
File: views.py Project: varunjammula/VSVBP
def scanner():
    form = Scanner()
    if form.validate_on_submit():
        flash('Seed URL="%s"' % (form.seed_url.data))
        o = urlparse(form.seed_url.data)
        if o.scheme == 'http' or o.scheme == 'https':
            option = form.example1.data
            obj = main(form.seed_url.data)
            flash("Total # urls found: " + str(len(obj.getUrlList())))
            if len(option) == 2:
                SQL_Module(obj)
                XSS_Module(obj)
            elif len(option) == 1:
                if option[0] == 'XSS':
                    XSS_Module(obj)
                elif option[0] == 'SQL':
                    SQL_Module(obj)

        else:
            flash('Invalid URL!')
    return render_template('scanner.html', title='Scanner', form=form)
Example #17
def crawl():
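    # Re-run the crawler, refresh the module-level cache, and return the new rows.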
    global a
    global b
    global t
    a, b, t = crawler.main()
    return jsonify(rows=b, time=t)
Example #18
 def test_crawl_doubled_id_page(self):
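     # A depth-2 crawl from this page should yield at least 1800 rows.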
     url = '/mascara-reestruturadora-monoi-e-argan-nick-vick-mascara-para-cabelos-quimicamente-tratados/p'
     crawler.main(['-d', '2', '-o', 'teste.csv', url])
     self.assertLessEqual(1800, len(self.load_result_csv()))
Example #19
 def test_crawl_malformed_url(self):
     url = '/cabelos/coloracao/tintura-para-cabelos/Sem Amônia'
     crawler.main(['-d', '0', '-o', 'teste.csv', url])
     self.assertEqual(0, len(self.load_result_csv()))
Example #20
    temp_pd = pandas.read_csv("pid.csv")
    n = temp_pd.shape[0]

    while True:

        time_start = time.time()

        # Build the proxy pool
        ippool = buildip.buildippool()
        # ippool = [{}]  # line used only for testing

        print('************************* Starting to crawl Weibo for %s locations *********************' %
              str(n))
        # Launch the crawlers
        for i in range(n):
            crawler.main(i, ippool, yag, emailname)

        time_end = time.time()
        print(' time cost ', time_end - time_start, 's')

        print('*********************** Sleeping three hours before crawling again ********************')
        conn = sqlite3.connect('weibo.sqlite')
        weibo_pd = pandas.read_sql_query("SELECT * FROM weibo", conn)
        wb_detail = weibo_pd['place'].value_counts().to_dict()
        wb_m = weibo_pd.shape[0]
        pic_pd = pandas.read_sql_query("SELECT * FROM pic", conn)
        pic_m = pic_pd.shape[0]
        conn.close()
        yag.send(
            to=[emailname],
            subject='All Done',
Example #21
# coding:utf-8
# version:python 3.7
# author:Ivy

import crawler


############## User settings #########################
cookie = ''  # your own cookie
mid = ''     # the id of the Weibo post to crawl
type = ''    # 'repost' or 'comment'
######################################################

result = crawler.main(mid, type, cookie)
if result == 0:
    print('Wrong value for type; fix it and run again!')
else:
    print('Crawling finished')
Example #22
def setup_window():
    global window
    # Main window
    window.title('spidy Web Crawler - by rivermont')
    window.iconbitmap('{0}\\media\\favicon.ico'.format(CRAWLER_DIR))

    overwrite = BooleanVar()
    raise_errors = BooleanVar()
    save_pages = BooleanVar()
    zip_files_ = BooleanVar()
    save_words = BooleanVar()
    # todo_file = StringVar()
    # done_file = StringVar()
    # bad_file = StringVar()
    # word_file = StringVar()
    save_count = IntVar()
    max_new_errors = IntVar()
    max_http_errors = IntVar()
    max_known_errors = IntVar()
    max_new_mimes = IntVar()
    # custom_headers = StringVar()

    # Frame to fill main window
    main_frame = ttk.Frame(window, padding='4')
    main_frame.grid(column=0, row=0, sticky=(N, W, E, S))
    main_frame.columnconfigure(0, weight=1)
    main_frame.rowconfigure(0, weight=1)

    # Container to hold variable settings
    setting_box = ttk.Frame(main_frame,
                            padding='4',
                            borderwidth=1,
                            relief='solid')
    setting_box.grid(column=0, row=0, sticky=(N, S, W))
    setting_box.columnconfigure(0, weight=1)
    setting_box.rowconfigure(0, weight=1)

    # Container for things on the right side of the main window
    right_bar = ttk.Frame(main_frame,
                          padding='4',
                          borderwidth=1,
                          relief='solid')
    right_bar.grid(column=1, row=0, sticky=(N, S, E))
    right_bar.columnconfigure(2, weight=1)
    right_bar.rowconfigure(0, weight=1)

    # Container for controlling the crawler
    control_box = ttk.Frame(right_bar,
                            padding='4',
                            borderwidth=1,
                            relief='solid')
    control_box.grid(column=1, row=0, sticky=(N, E, W))
    control_box.columnconfigure(1, weight=1)
    control_box.rowconfigure(0, weight=1)

    # Container for the status elements
    status_box = ttk.Frame(right_bar,
                           padding='4',
                           borderwidth=1,
                           relief='solid')
    status_box.grid(column=0, row=1, sticky=(E, W))
    status_box.columnconfigure(0, weight=1)
    status_box.rowconfigure(1, weight=1)

    # Container for the console log
    console_box = ttk.Frame(right_bar,
                            padding='4',
                            borderwidth=1,
                            relief='solid')
    console_box.grid(column=0, row=2)
    console_box.columnconfigure(0, weight=1)
    console_box.rowconfigure(2, weight=1)

    # Button to pause the crawler
    pause_button = ttk.Button(control_box, padding='4', text='Pause')
    pause_button.grid(column=0, row=0, sticky=(N, S, W))
    pause_button.columnconfigure(0, weight=1)
    pause_button.rowconfigure(0, weight=1)

    # Button to start the crawler
    # Pass the function itself; 'command=main()' would invoke it immediately
    go_button = ttk.Button(control_box, command=main, padding='4', text='Go')
    go_button.grid(column=1, row=0, sticky=(N, S))
    go_button.columnconfigure(1, weight=1)
    go_button.rowconfigure(0, weight=1)

    # Button to stop the crawler
    stop_button = ttk.Button(control_box, padding='4', text='Stop')
    stop_button.grid(column=2, row=0, sticky=(N, S, E))
    stop_button.columnconfigure(2, weight=1)
    stop_button.rowconfigure(0, weight=1)

    # Title for crawler setting area
    ttk.Label(setting_box, text='Crawler Settings').grid(column=0,
                                                         row=0,
                                                         columnspan=4,
                                                         sticky=(N, S))

    # Option to set Overwrite
    overwrite_check = ttk.Checkbutton(setting_box,
                                      text='Overwrite',
                                      variable=overwrite)
    overwrite_check.grid(column=0, row=1, columnspan=2, sticky=W)
    overwrite_check.columnconfigure(0, weight=1)
    overwrite_check.rowconfigure(1, weight=1)

    # Option to set RaiseErrors
    raise_errors_check = ttk.Checkbutton(setting_box,
                                         text='Raise Errors',
                                         variable=raise_errors)
    raise_errors_check.grid(column=0, row=2, columnspan=2, sticky=W)
    raise_errors_check.columnconfigure(0, weight=1)
    raise_errors_check.rowconfigure(2, weight=1)

    # Option to set SavePages
    save_pages_check = ttk.Checkbutton(setting_box,
                                       text='Save Pages',
                                       variable=save_pages)
    save_pages_check.grid(column=0, row=3, columnspan=2, sticky=W)
    save_pages_check.columnconfigure(0, weight=1)
    save_pages_check.rowconfigure(3, weight=1)

    # Option to set ZipFiles
    zip_files_check = ttk.Checkbutton(setting_box,
                                      text='Zip Files',
                                      variable=zip_files_)
    zip_files_check.grid(column=0, row=4, columnspan=2, sticky=W)
    zip_files_check.columnconfigure(0, weight=1)
    zip_files_check.rowconfigure(4, weight=1)

    # Option to set SaveWords
    save_words_check = ttk.Checkbutton(setting_box,
                                       text='Save Words',
                                       variable=save_words)
    save_words_check.grid(column=0, row=5, columnspan=2, sticky=W)
    save_words_check.columnconfigure(0, weight=1)
    save_words_check.rowconfigure(5, weight=1)

    # Field to enter number for SaveCount
    ttk.Label(setting_box, text='Save Count').grid(column=0,
                                                   row=6,
                                                   columnspan=2,
                                                   sticky=W)

    save_count_entry = ttk.Entry(setting_box, width=5, textvariable=save_count)
    save_count_entry.grid(column=0, row=7, sticky=(E, W))
    save_count_entry.columnconfigure(0, weight=1)
    save_count_entry.rowconfigure(7, weight=1)

    # Field to enter custom headers
    ttk.Label(setting_box, text='Custom Headers').grid(column=0,
                                                       row=8,
                                                       columnspan=2,
                                                       sticky=W)

    custom_headers_entry = Text(setting_box, height=3, width=16)
    custom_headers_entry.grid(column=0, row=9, columnspan=2, sticky=W)
    custom_headers_entry.columnconfigure(0, weight=1)
    custom_headers_entry.rowconfigure(9, weight=1)

    # Field to enter custom starting links
    ttk.Label(setting_box, text='Start Links').grid(column=0,
                                                    row=10,
                                                    columnspan=2,
                                                    sticky=W)

    custom_start_links = Text(setting_box, height=2, width=16)
    custom_start_links.grid(column=0, row=11, columnspan=2, sticky=W)
    custom_start_links.columnconfigure(0, weight=1)
    custom_start_links.rowconfigure(11, weight=1)

    # Button to select todo file
    get_todo_file_button = ttk.Button(setting_box,
                                      text='...',
                                      command=get_file)
    get_todo_file_button.grid(column=2, row=1, sticky=W)
    get_todo_file_button.columnconfigure(1, weight=1)
    get_todo_file_button.rowconfigure(2, weight=1)

    ttk.Label(setting_box, text='TODO File').grid(column=3, row=1, sticky=W)

    # Button to select done file
    get_done_file_button = ttk.Button(setting_box,
                                      text='...',
                                      command=get_file)
    get_done_file_button.grid(column=2, row=2, sticky=W)
    get_done_file_button.columnconfigure(2, weight=1)
    get_done_file_button.rowconfigure(2, weight=1)

    ttk.Label(setting_box, text='Done File').grid(column=3, row=2, sticky=W)

    # Button to select bad link file
    get_bad_file_button = ttk.Button(setting_box, text='...', command=get_file)
    get_bad_file_button.grid(column=2, row=3, sticky=W)
    get_bad_file_button.columnconfigure(2, weight=1)
    get_bad_file_button.rowconfigure(3, weight=1)

    ttk.Label(setting_box, text='Bad Link File').grid(column=3,
                                                      row=3,
                                                      sticky=W)

    # Button to select word file
    get_word_file_button = ttk.Button(setting_box,
                                      text='...',
                                      command=get_file)
    get_word_file_button.grid(column=2, row=4, sticky=W)
    get_word_file_button.columnconfigure(2, weight=1)
    get_word_file_button.rowconfigure(4, weight=1)

    ttk.Label(setting_box, text='Word File').grid(column=3, row=4, sticky=W)

    # Field to set MaxNewErrors
    max_new_error_entry = ttk.Entry(setting_box,
                                    width=4,
                                    textvariable=max_new_errors)
    max_new_error_entry.grid(column=2, row=5, sticky=(E, W))
    max_new_error_entry.columnconfigure(2, weight=1)
    max_new_error_entry.rowconfigure(5, weight=1)

    ttk.Label(setting_box, text='Max New Errors').grid(column=3,
                                                       row=5,
                                                       sticky=W)

    # Field to set MaxHTTPErrors
    max_http_error_entry = ttk.Entry(setting_box,
                                     width=4,
                                     textvariable=max_http_errors)
    max_http_error_entry.grid(column=2, row=6, sticky=(E, W))
    max_http_error_entry.columnconfigure(2, weight=1)
    max_http_error_entry.rowconfigure(6, weight=1)

    ttk.Label(setting_box, text='Max HTTP Errors').grid(column=3,
                                                        row=6,
                                                        sticky=W)

    # Field to set MaxKnownErrors
    max_known_errors_entry = ttk.Entry(setting_box,
                                       width=4,
                                       textvariable=max_known_errors)
    max_known_errors_entry.grid(column=2, row=7, sticky=(E, W))
    max_known_errors_entry.columnconfigure(2, weight=1)
    max_known_errors_entry.rowconfigure(7, weight=1)

    ttk.Label(setting_box, text='Max Known Errors').grid(column=3,
                                                         row=7,
                                                         sticky=W)

    # Field to set MaxNewMIMEs
    max_new_mimes_entry = ttk.Entry(setting_box,
                                    width=4,
                                    textvariable=max_new_mimes)
    max_new_mimes_entry.grid(column=2, row=8, sticky=(E, W))
    max_new_mimes_entry.columnconfigure(2, weight=1)
    max_new_mimes_entry.rowconfigure(8, weight=1)

    ttk.Label(setting_box, text='Max New MIMEs').grid(column=3,
                                                      row=8,
                                                      sticky=W)
Example #23
def GET():
    # e and e2 are presumably Tkinter Entry widgets defined elsewhere
    main(e.get(), e2.get())
Example #24
 def test_crawl_mock_pages_with_persistence(self):
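     # The '-r teste.json' option persists crawl state between runs (per the test name).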
     mock_params = {
         '/pagina_inicial': ('Pagina Inicial', 'Produto Inicial',
                             'produto_1/p', 'pagina_2', 'produto_3/p'),
         '/produto_1/p': ('Pagina Produto 1', 'Produto 1', 'produto_3/p',
                          'pagina_5', 'pagina_6'),
         '/produto_2/p': ('Pagina Produto 2', 'Produto 2', 'produto_4/p',
                          'produto_5/p', 'pagina_1'),
         '/produto_3/p': ('Pagina Produto 3', 'Produto 3', 'produto_2/p',
                          'produto_4/p', 'produto_6/p'),
         '/produto_4/p': ('Pagina Produto 4', 'Produto 4', 'pagina_inicial',
                          'pagina_6', 'produto_5/p'),
         '/produto_5/p': ('Pagina Produto 5', 'Produto 5', 'produto_7/p',
                          'produto_8/p', 'pagina_5'),
         '/produto_6/p': ('Pagina Produto 6', 'Produto 6', 'pagina_inicial',
                          'pagina_1', 'produto_5/p'),
         '/produto_7/p': ('Pagina Produto 6', 'Produto 6', 'pagina_inicial',
                          'pagina_1', 'produto_5/p'),
         '/pagina_1':
         ('Pagina 1', 'Página 1', 'produto_1/p', 'pagina_5', 'produto_3/p'),
         '/pagina_2':
         ('Pagina 2', 'Página 2', 'produto_2/p', 'pagina_1', 'pagina_3'),
         '/pagina_3': ('Pagina 3', 'Página 3', 'produto_6/p', 'pagina_1',
                       'pagina_inicial'),
         '/pagina_4':
         ('Pagina 4', 'Página 4', 'produto_8/p', 'pagina_5', 'produto_3/p'),
         '/pagina_5':
         ('Pagina 5', 'Página 5', 'produto_2/p', 'pagina_4', 'produto_4/p'),
         '/pagina_6': ('Pagina 6', 'Página 6', 'pagina_inicial', 'pagina_5',
                       'produto_3/p'),
     }
     # with patch('crawler.get_page_contents', MockPageGenerator(mock_params)):
     #     crawler.main(['-d', '0', '-o', 'teste.csv', '-r', 'teste.json', '/pagina_inicial'])
     # expected = []
     # self.assertEqual(expected, self.load_result_csv())
     # with patch('crawler.get_page_contents', MockPageGenerator(mock_params)):
     #     crawler.main(['-d', '0', '-o', 'teste.csv', '-r', 'teste.json', '/pagina_inicial'])
     # expected = [['Produto 1', 'Pagina Produto 1', 'http://www.epocacosmeticos.com.br/produto_1/p'],
     #             ['Produto 3', 'Pagina Produto 3', 'http://www.epocacosmeticos.com.br/produto_3/p']]
     # self.assertEqual(expected, self.load_result_csv())
     with patch('crawler.get_page_contents',
                MockPageGenerator(mock_params)):
         crawler.main([
             '-d', '2', '-o', 'teste.csv', '-r', 'teste.json',
             '/pagina_inicial'
         ])
     expected = [[
         'Produto 1', 'Pagina Produto 1',
         'http://www.epocacosmeticos.com.br/produto_1/p'
     ],
                 [
                     'Produto 3', 'Pagina Produto 3',
                     'http://www.epocacosmeticos.com.br/produto_3/p'
                 ],
                 [
                     'Produto 2', 'Pagina Produto 2',
                     'http://www.epocacosmeticos.com.br/produto_2/p'
                 ],
                 [
                     'Produto 4', 'Pagina Produto 4',
                     'http://www.epocacosmeticos.com.br/produto_4/p'
                 ],
                 [
                     'Produto 6', 'Pagina Produto 6',
                     'http://www.epocacosmeticos.com.br/produto_6/p'
                 ]]
     self.assertEqual(expected, self.load_result_csv())
Example #25
def main(title: str, skip_crawling: bool):
    title = str(title)
    if (not skip_crawling):
        crawler.main(title)
    print("Start to create video for {}".format(title))
    fps = config['animation_fps']
    width = config['width']
    height = config['height']
    test = config['test']

    # Paths
    output_dir = os.sep.join([".", "output"])
    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)
    resource_dir = os.sep.join([".", "resource", title])

    # Assets
    result = text_processing.load_data(title)
    title_font = ImageFont.truetype(config['title_font'],
                                    config['title_font_size'],
                                    encoding="utf-8")
    content_font = ImageFont.truetype(config['content_font'],
                                      config['content_font_size'],
                                      encoding="utf-8")
    title_wrapper = text_processing.Wrapper(title_font)
    content_wrapper = text_processing.Wrapper(content_font)
    audio_clip = AudioFileClip(
        os.sep.join([".", "resource", title, "audio", title + ".mp3"]))

    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)

    keys = list(map(int, result.keys()))
    if 0 not in keys:
        keys.append(0)
    keys.append(math.ceil(audio_clip.duration))
    keys.sort()
    #print(keys)
    video_clips = []

    key_length = 10 if test else len(keys) - 1
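    # In test mode, only the first 10 segments are rendered.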

    files = os.listdir(os.sep.join(['.', 'resource', title]))
    print(files)

    for i in range(0, key_length):
        key = str(keys[i])
        start = keys[i]
        end = keys[i + 1]
        #image_dir = os.sep.join(['.', 'resource', key+result[key]['image_suffix']])
        if ((key not in result.keys())
                or (key + result[key]['image_suffix'] not in files)):
            print("Case1")
            if key == '0':
                print("Creating title...")
                frame = image_processing.generate_title_image(
                    os.sep.join(['.', 'resource', title, 'title.jpg']),
                    (width, height))
            else:
                frame = image_processing.generate_blank_frame(
                    "", "", (width, height), title_wrapper, content_wrapper,
                    title_font, content_font)
            videoclip = video_processing.create_video_with_frame(
                frame, start, end)
            video_clips.append(videoclip)
        else:
            if (result[key]['image_suffix'].lower() not in [".gif"]):
                print("Case2")
                image = os.sep.join([
                    '.', 'resource', title,
                    str(key) + result[key]['image_suffix']
                ])
                header = result[key]['header']
                content = result[key]['content']
                frame = image_processing.generate_frame(
                    image, header, content, (width, height), title_wrapper,
                    content_wrapper, title_font, content_font)
                videoclip = video_processing.create_video_with_frame(
                    frame, start, end)
                video_clips.append(videoclip)
                #os.remove(image)
            elif (result[key]['image_suffix'].lower() in [".gif"]):
                print("Case3")
                image = os.sep.join([
                    '.', 'resource', title,
                    str(key) + result[key]['image_suffix']
                ])
                print(image)
                header = result[key]['header']
                content = result[key]['content']
                if config['skip_gif']:
                    background_frame = image_processing.generate_blank_frame(
                        header, content, (width, height), title_wrapper,
                        content_wrapper, title_font, content_font)
                    videoclip = video_processing.create_video_with_frame(
                        background_frame, start, end)
                else:
                    gif_clip = video_processing.load_gif_clip(image)
                    background_frame = image_processing.generate_blank_frame(
                        header, content, (width, height), title_wrapper,
                        content_wrapper, title_font, content_font)
                    videoclip = video_processing.create_video_with_gif_clip(
                        background_frame, gif_clip, start, end)
                video_clips.append(videoclip)

    merged_clips = concatenate_videoclips(video_clips)
    merged_clips.audio = audio_clip
    logo_clip = video_processing.load_logo(os.sep.join(
        [".", "util", config['logo_name']]),
                                           duration=merged_clips.duration)
    if config['enable_logo']:
        final_clip = video_processing.add_logo(merged_clips, logo_clip)
    else:
        final_clip = merged_clips
    if test:
        final_clip = video_processing.add_logo(
            merged_clips, logo_clip).subclip(0, min(50, final_clip.duration))
    final_clip.write_videofile(os.sep.join(
        [".", "output", title + "_animated.mp4"]),
                               fps=fps,
                               threads=4)
    print(title, "finished!")
Example #26
File: __main__.py Project: Kozek/Aran
import crawler

if __name__ == '__main__':
    crawler.main()
Example #27
def job():
    print('Starting scheduled job')
    crawler.main()
Example #28
# nohup python {{path}} &
from flask import Flask, Response, jsonify, render_template

import crawler

app = Flask(__name__)

a, b, t = crawler.main()
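# The line above runs an initial crawl at import time; /crawl re-runs it and updates these globals.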

@app.route("/")
def index():
    return render_template('index.html', title="hkepc")

@app.route("/crawl")
def crawl():
    global a
    global b
    global t
    a, b, t = crawler.main()
    return jsonify(rows=b, time=t)

@app.route("/api/get")
def getData():
    return jsonify(rows=b, time=t)

@app.route("/json")
def jsontest():
    list = [
        {'param': 'foo', 'val': 2},
        {'param': 'bar', 'val': 10}
    ]
Example #30
File: main.py Project: seo3650/KDJ_project
# -*- coding: utf-8 -*-
import notification.notification as notification
import score.score as score
import crawler as crawler
from time import sleep
import re

option = notification.main("initial", "")

while True:
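    # Once a day: crawl, score each article against the option, notify on matches.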

    print("option: " + option)
    contents = crawler.main()
    print(contents)
    for article in contents:
        signal = score.score_with_word(article[0], article[1], option)
        if signal:
            notification.main("notify", article[0])
    sleep(24*60*60)
Example #31
 def test_crawl_eternity_product_link_not_found(self):
     url = '/eternity-25th-anniversary-edition-for-women-eau-de-toilette-calvin-klein-perfume-feminino/p'
     crawler.main(['-d', '0', '-o', 'teste.csv', url])
     expected = []
     self.assertEqual(expected, self.load_result_csv())
Example #32
def main(title: str, skip_crawling: bool):
    title = str(title)
    if not skip_crawling:
        crawler.main(title)
    print("Start to create video for {}".format(title))
    fps = config['fps']
    width = config['width']
    height = config['height']

    # Paths
    output_dir = os.sep.join([".", "output"])
    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)
    resource_dir = os.sep.join([".", "resource", title])

    # Assets
    result = text_processing.load_data(title)
    title_font = ImageFont.truetype(config['title_font'], config['title_font_size'], encoding="utf-8")
    content_font = ImageFont.truetype(config['content_font'], config['content_font_size'], encoding="utf-8")
    title_wrapper = text_processing.Wrapper(title_font)
    content_wrapper = text_processing.Wrapper(content_font)

    # Video properties
    fourcc = VideoWriter_fourcc(*'mp4v')
    video = VideoWriter(os.sep.join([output_dir, title + '_simple.mp4']), fourcc, float(fps), (width, height))

    # Create the video
    keys = list(map(int, result.keys()))
    if 0 not in keys:
        keys.append(0)
        frame = image_processing.create_blank_frame("", "", (width, height), title_wrapper, content_wrapper, title_font, content_font)
    else:
        key = "0"
        image = os.sep.join([resource_dir, str(key) + result[key]['image_suffix']])
        header = result[key]['header']
        content = result[key]['content']
        print("Title: {}".format(header))
        if result[key]['image_suffix'] in ['.gif', '.GIF']:
            frame = image_processing.create_blank_frame(header, content, (width, height), title_wrapper, content_wrapper, title_font, content_font)
        else:
            frame = image_processing.create_frame(image, header, content, (width, height), title_wrapper, content_wrapper, title_font, content_font)
            #os.remove(image)

    keys.sort()
    # Make the last picture 20 seconds long
    keys.append(keys[-1] + 20)
    #print(keys)
    # Number of frames in this video
    total_length = keys[-1] * fps

    index = 0
    for i in range(total_length):
        if index + 1 > len(keys) - 1:
            frame = image_processing.create_blank_frame("", "", (width, height), title_wrapper, content_wrapper, title_font, content_font)
        elif (i / fps) > keys[index + 1]:
            index += 1
            print("Processing {} frames out of {}".format(index, len(keys) - 1))
            key = str(keys[index])
            image = os.sep.join([resource_dir, str(key) + result[key]['image_suffix']])
            header = result[key]['header']
            content = result[key]['content']
            print("Title: {}".format(header))
            if result[key]['image_suffix'] in ['.gif', '.GIF']:
                frame = image_processing.create_blank_frame(header, content, (width, height), title_wrapper, content_wrapper, title_font, content_font)
            else:
                frame = image_processing.create_frame(image, header, content, (width, height), title_wrapper, content_wrapper, title_font, content_font)
                #os.remove(image)
        video.write(frame)
    video.release()  # finalize the output file
    print("{} finished!".format(title))
Example #33
 def test_crawl_invalid_product(self):
     url = '/invalid-product/p'
     crawler.main(['-d', '0', '-o', 'teste.csv', url])
     expected = []
     self.assertEqual(expected, self.load_result_csv())
Example #34
File: daemon.py Project: akkakks/pycrawler
def main():
    import crawler
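    # Raise the crawler's download parallelism before starting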
    crawler.common.DOWNLOAD_THREADS = 10
    crawler.main()
Example #35
 def test_crawl_home_page_depth_0(self):
     crawler.main(['-d', '0', '-o', 'teste.csv', '/'])
     expected = []
     self.assertEqual(expected, self.load_result_csv())
Example #36
def main(title: str, skip_crawling: bool):
    title = str(title)
    if (not skip_crawling):
        crawler.main(title)
    print("Start to create video for {}".format(title))
    fps = config['fps']
    width = config['width']
    height = config['height']

    # Paths
    output_dir = os.sep.join([".", "output"])
    if not os.path.exists(output_dir):
        print("Folder", output_dir, 'does not exist. Creating...')
        os.makedirs(output_dir)
    resource_dir = os.sep.join([".", "resource", title])

    # Assets
    result = text_processing.load_data(title)
    title_font = ImageFont.truetype(config['title_font'],
                                    config['title_font_size'],
                                    encoding="utf-8")
    content_font = ImageFont.truetype(config['content_font'],
                                      config['content_font_size'],
                                      encoding="utf-8")
    title_wrapper = text_processing.Wrapper(title_font)
    content_wrapper = text_processing.Wrapper(content_font)
    audio_clip = AudioFileClip(
        os.sep.join([resource_dir, "audio", title + ".mp3"]))

    # Video Properties
    fourcc = VideoWriter_fourcc(*'mp4v')
    video = VideoWriter(os.sep.join([output_dir, title + '_complex_temp.mp4']),
                        fourcc, float(fps), (width, height))

    # Create Video
    keys = list(map(int, result.keys()))
    if 0 not in keys:
        keys.append(0)
        frame = image_processing.generate_cv2_title_image(
            os.sep.join(['.', 'resource', title, 'title.jpg']),
            (width, height))
    else:
        key = "0"
        image = os.sep.join(
            [resource_dir,
             str(key) + result[key]['image_suffix']])
        header = result[key]['header']
        content = result[key]['content']
        print("标题:{}".format(header))
        if (result[key]['image_suffix'] in ['.gif', '.GIF']):
            frame = image_processing.generate_cv2_blank_frame(
                header, content, (width, height), title_wrapper,
                content_wrapper, title_font, content_font)
        else:
            frame = image_processing.generate_cv2_frame(
                image, header, content, (width, height), title_wrapper,
                content_wrapper, title_font, content_font)
            #os.remove(image)

    keys.sort()
    # Extend the timeline to the end of the audio track
    keys.append(math.ceil(audio_clip.duration))
    #print(keys)
    # Number of frames in this video
    total_length = (200 if config['test'] else keys[len(keys) - 1]) * fps

    index = 0
    for i in range(total_length):
        if (index > len(keys) - 1):
            frame = image_processing.generate_cv2_blank_frame(
                "", "", (width, height), title_wrapper, content_wrapper,
                title_font, content_font)
        elif (i / fps) >= keys[index + 1]:
            index += 1
            print("Processing {} frames out of {}".format(
                index,
                len(keys) - 1))
            key = str(keys[index])
            image = os.sep.join(
                [resource_dir,
                 str(key) + result[key]['image_suffix']])
            header = result[key]['header']
            content = result[key]['content']
            print("标题:{}".format(header))
            if (result[key]['image_suffix'] in ['.gif', '.GIF']):
                frame = image_processing.generate_cv2_blank_frame(
                    header, content, (width, height), title_wrapper,
                    content_wrapper, title_font, content_font)
            else:
                frame = image_processing.generate_cv2_frame(
                    image, header, content, (width, height), title_wrapper,
                    content_wrapper, title_font, content_font)
                #os.remove(image)
        else:
            pass
        video.write(frame)
    video.release()
    video_clip = VideoFileClip(
        os.sep.join([output_dir, title + "_complex_temp.mp4"]))
    print(video_clip.duration)
    video_clip.audio = audio_clip
    if config['enable_logo']:
        logo_clip = video_processing.load_logo(os.sep.join(
            [".", "util", config['logo_name']]),
                                               duration=video_clip.duration)
        video_clip = video_processing.add_logo(video_clip, logo_clip)
    if config['test']:
        video_clip = video_clip.subclip(0, min(200, video_clip.duration))
    video_clip.write_videofile(os.sep.join(
        [output_dir, title + "_complex.mp4"]),
                               fps=fps)
    print("{} finished!".format(title))
    os.remove(os.sep.join([output_dir, title + "_complex_temp.mp4"]))
Example #37
 def test_crawl_home_page_depth_1(self):
     crawler.main(['-d', '1', '-o', 'teste.csv', '/'])
     self.assertLessEqual(70, len(self.load_result_csv()))