예제 #1
0
def main():
  """Run process_file over every two-letter country HTML page.

  Reads the corpus settings from 'factbook_ingest.properties', looks in the
  corpus' 'countries_html' intermediate directory for files named 'xx.html',
  and calls process_file for each one, targeting 'xx.xml' in the sibling
  'countries_detag' directory (created if it does not exist yet).
  """
  properties = factbook_utils.read_properties('factbook_ingest.properties')

  # Both directories share the same '<root>/intermediate/<abbrev>-<date>' prefix.
  corpus_dir = (properties['corpus.dir.root'] + '/intermediate/'
                + properties['corpus.abbreviation'] + '-' + properties['corpus.date'])

  # Input directory
  input_dir = corpus_dir + '/countries_html'
  print('Input directory: ' + input_dir)

  # Create output directory.
  output_dir = corpus_dir + '/countries_detag'
  print('Output directory: ' + output_dir)
  if not os.path.exists(output_dir):
    os.makedirs(output_dir)

  # Process each .html file found in the input directory.
  ext = ".html"
  expected_length = len('xx.html')
  counter = 0
  for file_name in os.listdir(input_dir):
    # Files we are interested in are named: 'xx.html'.
    # Compare by value: 'is not' on an int literal tests object identity and
    # only happens to work because of CPython's small-integer cache.
    if len(file_name) != expected_length:
      print('Skipping ' + file_name)
      continue

    # Files must end in '.html' (case-insensitive); others are ignored silently.
    if file_name.lower().endswith(ext):
      infile = os.path.join(input_dir, file_name)
      outfile = os.path.join(output_dir, file_name[:2] + ".xml")
      process_file(infile, outfile)
      counter += 1

  print('Processed ' + str(counter) + ' files')
예제 #2
0
def main():
    """Unzip the country HTML pages from the Factbook archive.

    Reads the corpus settings from "factbook_ingest.properties", extracts the
    wanted ".html" members of a "geos" directory from "factbook.zip" into the
    corpus' "countries_html" intermediate directory, then flattens the
    extracted "factbook/geos/" tree so the pages sit directly in that
    directory (dropping any "countrytemplate_" prefix from the file names).
    """
    properties = factbook_utils.read_properties("factbook_ingest.properties")
    corpus_id = properties["corpus.abbreviation"] + "-" + properties["corpus.date"]
    input_zipfile = (
        properties["corpus.dir.root"] + "/input/" + corpus_id + "/factbook.zip"
    )
    output_dir = (
        properties["corpus.dir.root"]
        + "/intermediate/"
        + corpus_id
        + "/countries_html"
    )

    # Ignore the following countries, because the files contain no data.
    skipped_suffixes = (
        "_hq.html",
        "_jq.html",
        "_mq.html",
        "_dq.html",
        "_lq.html",
        "_kq.html",
        "_fq.html",
        "_va.html",
        "_ss.html",
        "_xx.html",
    )

    # The context manager closes the archive even if an extraction fails.
    with zipfile.ZipFile(input_zipfile, "r") as archive:
        # extract country files
        for member in archive.namelist():
            filename = os.path.basename(member)
            dirname = os.path.basename(os.path.dirname(member))
            # With the latest 2014 version of Factbook, the countrytemplate
            # files no longer exist, so any HTML file in the geos directory is
            # unzipped instead of only 'countrytemplate*' files
            # (updated code on 2/24/2014).
            if (
                filename.endswith(".html")
                and "geos" in dirname
                and not filename.endswith(skipped_suffixes)
            ):
                # NOTE: the member is addressed by its expected archive path,
                # not by `member`; the move step below relies on that layout.
                archive.extract("factbook/geos/" + filename, output_dir)

    # move country files to correct location
    src_dir = output_dir + "/factbook/geos/"
    for file_name in os.listdir(src_dir):
        short_name = file_name.replace("countrytemplate_", "")
        shutil.move(src_dir + file_name, output_dir + "/" + short_name)

    # Remove the now-empty directories left behind by the extraction.
    os.rmdir(output_dir + "/factbook/geos")
    os.rmdir(output_dir + "/factbook")
    print("Done.")
예제 #3
0
def main():
    """Run process_file over every file in the 'countries_detag' directory.

    Reads the corpus settings from 'factbook_ingest.properties', then, in
    sorted file-name order, opens each input file and a same-named output file
    under 'xml-splitTrecTrim' (created if it does not exist yet) and hands both
    to process_file together with a counter that increases by one per file.
    """
    properties = factbook_utils.read_properties('factbook_ingest.properties')
    corpus_dir = (properties['corpus.dir.root'] + '/intermediate/'
                  + properties['corpus.abbreviation'] + '-' + properties['corpus.date'])
    # e.g. /apps/preproc/r1.5.0/factbook/intermediate/fb-20130218/countries_detag
    input_dir = corpus_dir + '/countries_detag'
    output_dir = corpus_dir + '/xml-splitTrecTrim'
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    docno_counter = 0
    for file_name in sorted(os.listdir(input_dir)):
        in_path = os.path.join(input_dir, file_name)
        out_path = os.path.join(output_dir, file_name)
        # Close both handles after each file so descriptors are not leaked and
        # the output is flushed to disk before moving on.
        with open(in_path) as infile, open(out_path, 'w') as outfile:
            # NOTE(review): docno_counter presumably seeds document numbering
            # inside process_file -- confirm against its definition.
            process_file(infile, outfile, docno_counter)
        docno_counter += 1
    print('Done')