def construct_css(content, subdirectory, date, id_doc_xml, lg):
  """Write the stylesheet file for one document and return its file name.

  The file is created in ``subdirectory`` and named
  ``<yyyy>-<mm>-<dd>_celex_<id_doc_xml>.<lg>.css``. It holds the value
  returned by ``get_style_balise_content(content)`` followed by the content
  fetched with ``tools.url2content`` for every URL returned by
  ``create_css_list(content)``; each item is followed by a newline.

  Args:
    content: document content handed to the two helper functions.
    subdirectory: directory in which the file is written.
    date: ``(dd, mm, yyyy)`` triple; each part must be accepted by ``int()``.
    id_doc_xml: document identifier embedded in the file name.
    lg: language code embedded in the file name.

  Returns:
    The file name only (it is not joined with ``subdirectory``).
  """
  css_urls = create_css_list(content)
  (dd, mm, yyyy) = date
  filename = '%d-%0.2d-%0.2d_celex_%s.%s.css' % (
      int(yyyy), int(mm), int(dd), id_doc_xml, lg)
  path_css = os.path.join(subdirectory, filename)
  # The context manager closes the file even when a helper or a download
  # raises part-way through, which the previous open()/close() pair did not.
  with open(path_css, 'w') as f:
    # '%s\n' writes the same text as the former ``print >>f, value``.
    f.write('%s\n' % (get_style_balise_content(content),))
    for url_css in css_urls:
      f.write('%s\n' % (tools.url2content(url_css),))
  return filename
def crawl(options):
    list_lg = options.languages
    start_date, end_date = starting_ending_date(options.period)
    directory = options.output_dir
    nb_pages = init_crawl(start_date, end_date, list_lg)

    cpt_done = 0
    for nb in xrange(1, nb_pages + 1):
        url = construct_urlpage(start_date, end_date, list_lg, nb)
        content = tools.url2content(url)
        list_entry = get_entry(content)
        for e in list_entry:
            cpt_done += 1
            cpt_done = check_sleep(cpt_done)
            (code_url, code_document) = e['ip']
            for lg in list_lg:
                if lg not in e['lg']:
                    continue
                if options.verbose:
                    print '/'.join(e['date']), code_document, lg
                url = e['lg'][lg]
                content = tools.url2content(url)
                process(content, code_url, directory, lg, e['date'], options)
def crawl(options) :
  list_lg = options.languages
  start_date, end_date = starting_ending_date(options.period)
  directory = options.output_dir
  nb_pages = init_crawl(start_date, end_date, list_lg)

  cpt_done = 0
  for nb in xrange(1, nb_pages+1) :
    url = construct_urlpage(start_date, end_date, list_lg, nb)
    content = tools.url2content(url)
    list_entry = get_entry(content)
    for e in list_entry :
      cpt_done += 1
      cpt_done = check_sleep(cpt_done)
      (code_url, code_document) = e['ip']
      for lg in list_lg :
        if lg not in e['lg'] :
          continue
        if options.verbose :
          print '/'.join(e['date']), code_document, lg 
        url = e['lg'][lg]
        content = tools.url2content(url)          
        process(content, code_url, directory, lg, e['date'], options)
# Example no. 4
# 0
def construct_css(content, subdirectory, date, id_doc_xml, lg):
    """Build the CSS file associated with a document and return its name.

    The output file, ``<yyyy>-<mm>-<dd>_celex_<id_doc_xml>.<lg>.css`` inside
    ``subdirectory``, receives the result of
    ``get_style_balise_content(content)`` and then, for each URL yielded by
    ``create_css_list(content)``, whatever ``tools.url2content`` returns for
    it. Every item is terminated by a newline.

    Args:
        content: document content given to the two helper functions.
        subdirectory: directory that receives the file.
        date: ``(dd, mm, yyyy)`` triple whose parts are converted with
            ``int()``.
        id_doc_xml: document identifier used in the file name.
        lg: language code used in the file name.

    Returns:
        The bare file name, without ``subdirectory``.
    """
    stylesheet_urls = create_css_list(content)
    (dd, mm, yyyy) = date
    filename = '%d-%0.2d-%0.2d_celex_%s.%s.css' % (int(yyyy), int(mm), int(dd),
                                                   id_doc_xml, lg)
    path_css = os.path.join(subdirectory, filename)
    # ``with`` guarantees the handle is released if a helper or a download
    # raises; the earlier explicit close() was skipped in that case.
    with open(path_css, 'w') as css_file:
        # Same output as the former ``print >> f, value``: text plus newline.
        css_file.write('%s\n' % (get_style_balise_content(content),))
        for url_css in stylesheet_urls:
            css_file.write('%s\n' % (tools.url2content(url_css),))
    return filename
def init_crawl(start_date, end_date, list_lg):
    """Return, as an ``int``, the page count derived from the first result page.

    The URL of page 1 is built with ``construct_urlpage``, fetched with
    ``tools.url2content``, and the result of ``compute_nbpage`` on that
    content is converted with ``int()``.
    """
    first_page_url = construct_urlpage(start_date, end_date, list_lg, 1)
    first_page = tools.url2content(first_page_url)
    return int(compute_nbpage(first_page))
def init_crawl(start_date, end_date, list_lg):
  """Fetch result page 1 and return what ``compute_nbpage`` reports, as ``int``."""
  # Page 1 is requested with the same dates and languages the caller passes in.
  page_one = tools.url2content(
      construct_urlpage(start_date, end_date, list_lg, 1))
  return int(compute_nbpage(page_one))