Example #1
def get_registraduria_with_params():
    '''
    Combines all the parameters in the `params` dict into every possible
    combination, scrapes each resulting URL into the output folder
    (scrape_data_folder), then parses the downloaded files with
    SampleHTMLParser, which counts the anchors and paragraphs in each file
    and writes the results to another folder (extracted_data_folder).
    '''
    scrape_data_folder = "registraduria_raw_data"
    extracted_data_folder = "registraduria_extracted_data"

    params = {
        "objeto": [
            "10000000", "11000000", "12000000", "15000000", "13000000",
            "14000000", "27000000", "20000000", "21000000"
        ],
        "paginaObjetivo": ["1"],
        "cuantias": ["1", "2", "3"]
    }
    url = "https://www.contratos.gov.co/consultas/resultadosConsulta.do?&ctl00$ContentPlaceHolder1$hidIDProducto=-1&ctl00$ContentPlaceHolder1$hidRedir=&departamento=&ctl00$ContentPlaceHolder1$hidNombreDemandante=-1&ctl00$ContentPlaceHolder1$hidNombreProducto=-1&fechaInicial=&ctl00$ContentPlaceHolder1$hidIdEmpresaC=0&ctl00$ContentPlaceHolder1$hidIdOrgV=-1&ctl00$ContentPlaceHolder1$hidIDProductoNoIngresado=-1&ctl00$ContentPlaceHolder1$hidRangoMaximoFecha=&fechaFinal=&desdeFomulario=true&ctl00$ContentPlaceHolder1$hidIdOrgC=-1&ctl00$ContentPlaceHolder1$hidIDRubro=-1&tipoProceso=&registrosXPagina=10&numeroProceso=&municipio=0&estado=0&ctl00$ContentPlaceHolder1$hidNombreProveedor=-1&ctl00$ContentPlaceHolder1$hidIdEmpresaVenta=-1"

    print("scraping registraduria...")
    scrape(url,
           output_folder=scrape_data_folder,
           params_and_values=params,
           workers=10)
    print("parsing registraduria..")
    parse(input_folder=scrape_data_folder,
          output_folder=extracted_data_folder,
          parser=SampleHTMLParser(),
          workers=8)
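
For readers unfamiliar with tarantula, the `params_and_values` argument presumably expands the dict above into one request per combination of values. The standalone sketch below only illustrates that expansion; the `expand_params` helper is hypothetical and not part of tarantula's API:

from itertools import product
from urllib.parse import urlencode

def expand_params(base_url, params):
    # Hypothetical helper: yield one URL per combination of parameter values,
    # roughly what scrape(..., params_and_values=params) is assumed to do.
    keys = sorted(params)
    for combo in product(*(params[key] for key in keys)):
        query = urlencode(dict(zip(keys, combo)))
        separator = "&" if "?" in base_url else "?"
        yield base_url + separator + query

# 9 "objeto" values x 1 "paginaObjetivo" value x 3 "cuantias" values = 27 URLs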
Example #2
def busqueda_de_vias():
  # Scrape the monitoreoinvias departamento page once per departamento id,
  # then parse the downloaded search-result pages.
  params = {
      "id": ["05", "15", "17", "18", "19", "27", "25", "95", "41", "52", "54", "63", "66", "68", "73", "76"]
  }
  url = "https://monitoreoinvias.com/publico-departamento.php"
  scrape(url, output_folder=scrape_busqueda_data_folder, params_and_values=params, workers=10)
  parse(input_folder=scrape_busqueda_data_folder, output_folder=extracted_busqueda_data_folder, parser=MonitoreoViasParserSearchResults(), workers=10)
Example #3
def busqueda_de_vias():
    params = {
        "id": [
            "05", "15", "17", "18", "19", "27", "25", "95", "41", "52", "54",
            "63", "66", "68", "73", "76"
        ]
    }
    url = "https://monitoreoinvias.com/publico-departamento.php"
    scrape(url,
           output_folder=scrape_busqueda_data_folder,
           params_and_values=params,
           workers=10)
    parse(input_folder=scrape_busqueda_data_folder,
          output_folder=extracted_busqueda_data_folder,
          parser=MonitoreoViasParserSearchResults(),
          workers=10)
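
Both versions of busqueda_de_vias rely on module-level folder names defined elsewhere in example.py. A minimal sketch of the assumed setup; the folder values here are placeholders, not necessarily what the project uses:

# Assumed module-level configuration; the real values in example.py may differ.
scrape_busqueda_data_folder = "busqueda_vias_raw_data"
extracted_busqueda_data_folder = "busqueda_vias_extracted_data"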
Example #4
File: example.py Project: dav009/tarantula
def get_registraduria_with_params():
  '''
  Combines all the parameters in the `params` dict into every possible
  combination, scrapes each resulting URL into the output folder
  (scrape_data_folder), then parses the downloaded files with
  SampleHTMLParser, which counts the anchors and paragraphs in each file
  and writes the results to another folder (extracted_data_folder).
  '''
  scrape_data_folder = "registraduria_raw_data"
  extracted_data_folder = "registraduria_extracted_data"

  params = {
       "objeto" : ["10000000", "11000000", "12000000", "15000000", "13000000", "14000000", "27000000", "20000000", "21000000"],
       "paginaObjetivo": ["1"],
       "cuantias": ["1", "2", "3"]
  }
  url = "https://www.contratos.gov.co/consultas/resultadosConsulta.do?&ctl00$ContentPlaceHolder1$hidIDProducto=-1&ctl00$ContentPlaceHolder1$hidRedir=&departamento=&ctl00$ContentPlaceHolder1$hidNombreDemandante=-1&ctl00$ContentPlaceHolder1$hidNombreProducto=-1&fechaInicial=&ctl00$ContentPlaceHolder1$hidIdEmpresaC=0&ctl00$ContentPlaceHolder1$hidIdOrgV=-1&ctl00$ContentPlaceHolder1$hidIDProductoNoIngresado=-1&ctl00$ContentPlaceHolder1$hidRangoMaximoFecha=&fechaFinal=&desdeFomulario=true&ctl00$ContentPlaceHolder1$hidIdOrgC=-1&ctl00$ContentPlaceHolder1$hidIDRubro=-1&tipoProceso=&registrosXPagina=10&numeroProceso=&municipio=0&estado=0&ctl00$ContentPlaceHolder1$hidNombreProveedor=-1&ctl00$ContentPlaceHolder1$hidIdEmpresaVenta=-1"


  print("scraping registraduria...")
  scrape(url, output_folder=scrape_data_folder, params_and_values=params, workers=10)
  print("parsing registraduria..")
  parse(input_folder=scrape_data_folder, output_folder=extracted_data_folder, parser=SampleHTMLParser(), workers=8)
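
SampleHTMLParser is provided by the tarantula project and is not shown in these excerpts. As a rough illustration of what an anchor-and-paragraph counter looks like, here is a standalone sketch built on Python's standard html.parser; it is hypothetical and does not claim to match the parser interface that tarantula's parse() expects:

from html.parser import HTMLParser

class AnchorParagraphCounter(HTMLParser):
    # Hypothetical stand-in for tarantula's SampleHTMLParser: counts <a> and <p> tags.
    def __init__(self):
        super().__init__()
        self.anchors = 0
        self.paragraphs = 0

    def handle_starttag(self, tag, attrs):
        if tag == "a":
            self.anchors += 1
        elif tag == "p":
            self.paragraphs += 1

counter = AnchorParagraphCounter()
counter.feed("<p>hi <a href='#'>link</a></p>")
print(counter.anchors, counter.paragraphs)  # 1 1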