def process_sentences(file_path, pdf_name):
  """Parse a tab-separated sentence file into per-sentence records.

  The first line of the file is treated as a header and skipped. Every other
  line is lower-cased and split on tabs; the fields are interpreted as::

      sent_id <TAB> sect_name <TAB> box_name <TAB> text... <TAB> word_ids

  where ``word_ids`` is a comma-separated list and any fields between
  ``box_name`` and ``word_ids`` are joined with single spaces to form the
  sentence text. The text is tokenised with
  ``re.split(word_split_pattern, text)`` and tokens equal to ``" "``,
  ``"\\t"`` or ``""`` are dropped.

  A line with fewer than four fields, or whose token count differs from its
  word-id count, is appended to ``error_sents`` and left out of the results.

  NOTE(review): ``error_sents``, ``word_split_pattern``, ``statistics``,
  ``max_entity_words`` and ``number_entities_rejected`` are not defined in
  this function; they are resolved from the enclosing module scope. Confirm
  that ``error_sents`` is meant to be shared across calls.

  Args:
    file_path: Path of the tab-separated sentence file to read.
    pdf_name: Name of the source PDF; used (lower-cased) as the prefix of the
      logged statistics lines.

  Returns:
    A tuple ``(sent_list, sent_obj, error_sents)`` where ``sent_list`` is the
    list of accepted sentence dicts in file order, ``sent_obj`` maps each
    ``sent_id`` to the same dict, and ``error_sents`` is the module-level
    collection of rejected lines. An empty file yields empty ``sent_list``
    and ``sent_obj`` instead of raising ``IndexError``.
  """
  # Use a context manager so the file handle is closed deterministically.
  with open(file_path, 'r') as sent_file:
    raw_lines = [line.rstrip('\n') for line in sent_file]

  sent_list = []
  sent_obj = {}

  # Skip the header row; slicing (unlike pop(0)) tolerates an empty file.
  for sent in raw_lines[1:]:
    sent_split = sent.lower().split("\t")
    if len(sent_split) < 4:
      error_sents.append(sent_split)
      continue

    # First three fields are identifiers, the last holds the word ids, and
    # whatever lies in between is the sentence text.
    sent_id, sect_name, box_name, *text_fields, word_id_field = sent_split
    word_ids = word_id_field.split(",")
    text = " ".join(text_fields)
    word_array = [
      token for token in re.split(word_split_pattern, text)
      if token not in (" ", "\t", "")
    ]
    sent_obj_obj = {
      'sent_id': sent_id,
      'sect_name': sect_name,
      'box_name': box_name,
      'text': text,
      'word_array': word_array,
      'word_ids': word_ids,
    }

    # Filter out & ignore sentences incorrectly split by PDFNLT (incorrect
    # word displayed in XHTML): tokens and word ids must pair up one-to-one.
    if len(word_array) != len(word_ids):
      error_sents.append(sent_obj_obj)
      continue

    # Add extra metadata to the word array, then add the sentence to both
    # the list and the id-keyed object. Lengths are equal here, so zip()
    # pairs every token with its id.
    sent_obj_obj['word_array_info'] = [
      {'text': word, 'word_id': word_id}
      for word, word_id in zip(word_array, word_ids)
    ]
    sent_list.append(sent_obj_obj)
    sent_obj[sent_id] = sent_obj_obj

  # Add some sentence split processing stats
  statistics.init()
  statistics.log_stat(f'{pdf_name.lower()}: # sentences incorrectly split by PDFNLT: {len(error_sents)}/{len(sent_list)}')
  statistics.log_stat(f'{pdf_name.lower()}: # entities rejected because entity.number_words > max_entity_words ({max_entity_words}): {number_entities_rejected}')

  return sent_list, sent_obj, error_sents
    config['init_uninit_vars']      = args.init_uninit_vars
    config['redundant_test']        = args.redundant_test
    config['verbose']               = args.verbose
    config['build_before_instr']    = args.build_before_instr
    config['instr_printf']          = args.instr_printf
    config['mute_build_message']    = args.mute_build_message
    config['mute_test_message']     = args.mute_test_message
    config['mute_warning']          = args.mute_warning
    config['localize_only']         = args.localize_only
    config['invalid_localization']  = args.invalid_localization

    if args.verbose:
        for key, value in config.items():
            logger.info('option {} = {}'.format(key, value))

    statistics.init(working_dir)

    if args.ignore_lines:
        args.lines = None

    tool = Angelix(working_dir,
                   src=args.src,
                   buggy=args.buggy,
                   oracle=abspath(args.oracle),
                   tests=args.tests,
                   golden=args.golden,
                   asserts=asserts,
                   lines=args.lines,
                   build=args.build,
                   configure=args.configure,
                   config=config)
Пример #3
0
    config['build_before_instr'] = args.build_before_instr
    config['mute_build_message'] = args.mute_build_message
    config['mute_test_message'] = args.mute_test_message
    config['mute_warning'] = args.mute_warning
    config['build_validation_only'] = args.build_validation_only
    config['build_golden_only'] = args.build_golden_only
    config['build_backend_only'] = args.build_backend_only
    config['localize_only'] = args.localize_only
    config['invalid_localization'] = args.invalid_localization
    config['term_when_syn_crashes'] = args.term_when_syn_crashes

    if args.verbose:
        for key, value in config.items():
            logger.info('option {} = {}'.format(key, value))

    statistics.init(working_dir)

    if args.ignore_lines:
        args.lines = None

    tool = Angelix(working_dir,
                   src=args.src,
                   buggy=args.buggy,
                   oracle=abspath(args.oracle),
                   tests=args.tests,
                   golden=args.golden,
                   asserts=asserts,
                   lines=args.lines,
                   build=args.build,
                   configure=args.configure,
                   config=config)
Пример #4
0
        subproc_output = sys.stderr
    else:
        subproc_output = subprocess.DEVNULL

    if args.fuzz_results_dir is not None:
        copy(args.fuzz_results_dir, join(poracle_workdir, 'fuzz-results'))

    with open(args.config_file, "r") as read_file:
        data = json.load(read_file)
        target = data['target'] if 'target' in data else None
        delta_allowed = float(
            data['delta_allowed']) if 'delta_allowed' in data else 0
        data['delta_allowed'] = delta_allowed
        threadName = data['threadName'] if 'threadName' in data else 'main'

        statistics.init(poracle_workdir, args.config_file, poracle_config,
                        data)

        try:
            poracle = Poracle(poracle_config, poracle_workdir, deltas_dir,
                              data['project'], data['bug_id'], data['ID'],
                              target, delta_allowed, data['correctness'],
                              threadName)
            code = None
            code = poracle()
        except PoracleException as e:
            logger.error(e.get_msg())
        end_time = time.time()
        elapsed_time = end_time - start_time
        statistics.data['total_time'] = elapsed_time
        statistics.save()
        if code is not None: