Exemplo n.º 1
0
def unescape_all_tables(search_path):
    """Un-escape HTML entities in every file found under *search_path*.

    For each input file, writes a sibling file with the extension replaced
    by '.unescaped' containing the html.unescape'd contents.
    """
    for filename in get_filenames(search_path):
        # Bug fix: the f-string had no placeholder, so every file printed
        # the literal text '(unknown)' instead of its actual name.
        print(f'Un-escaping file: {filename}')
        parts = filename.split('.')
        out_filename = '.'.join(parts[:-1]) + '.unescaped'
        converted = html.unescape(read_file(filename))
        write_file(out_filename, converted)
Exemplo n.º 2
0
def aggregate_sw_csvs(direc, output_name):
    """Aggregate the Switchboard CSV files in *direc* into one feature file
    and one label file.

    Writes '<output_name>_X.csv' (space-joined feature strings) and
    '<output_name>_y.csv' (converted labels) via write_file. An empty
    entry is appended after each source file as a boundary marker.
    """
    features = []
    labels = []
    last_speaker = ''
    for csv_path in tqdm(read_files_in_direc(direc, file_ext='.csv')):
        with open(csv_path, 'r') as handle:
            for row_num, row in enumerate(csv.reader(handle, delimiter=',')):
                if row_num == 0:
                    continue  # skip the header row
                pos_words = filter_pos_words(row[9])
                if not pos_words:
                    continue  # rows with no POS words contribute nothing
                starts_with_i = '1' if pos_words[0].startswith("I/") else '0'
                word_count = str(len(pos_words.split(' ')))
                # Same-speaker flag: only meaningful from the second data
                # row onward; last_speaker deliberately carries across files.
                if row_num > 1 and row[5] == last_speaker:
                    same_speaker = '1'
                else:
                    same_speaker = '0'
                last_speaker = row[5]
                features.append(' '.join(
                    [pos_words, starts_with_i, word_count, same_speaker]))
                labels.append(convert_sw_to_y(row[4]))
            # Blank entry marks the boundary between source files.
            features.append('')
            labels.append('')
    write_file(features, output_name + '_X.csv')
    write_file(labels, output_name + '_y.csv')
Exemplo n.º 3
0
def write_dict(X, write_fn):
    """Write the unique POS tokens appearing in *X* to file *write_fn*.

    *X* is an iterable of lines, each an iterable of POS tokens.
    First-seen order is preserved in the output.

    Fix: the original tested membership against a growing list (O(n) per
    token, O(n^2) overall); a companion set makes each test O(1) while the
    output list keeps the original insertion order.
    """
    seen = set()
    pos_words = []
    for line in X:
        for pos in line:
            if pos not in seen:
                seen.add(pos)
                pos_words.append(pos)
    write_file(pos_words, write_fn)
Exemplo n.º 4
0
 def file_flush(update_type):
     """Flush the accumulated token records to disk.

     SINGLE_FILE mode joins update_tokens into one 'tokenized' file under
     output_path; otherwise the collected per-pair output paths are joined
     into 'separate_files/file_list'. Reads closure variables output_path,
     update_tokens and separate_files.
     """
     if update_type == SINGLE_FILE:
         target = os.path.join(output_path, 'tokenized')
         payload = '\n'.join(update_tokens)
     else:
         target = os.path.join(output_path, 'separate_files', 'file_list')
         payload = '\n'.join(separate_files)
     write_file(target, payload)
Exemplo n.º 5
0
 def generate_all(self, f_name, f_contents):
     """
     Apply f_name (item -> string) and f_contents (item -> bytestring) to
     each item from self.all(), writing every result out under the
     computed output file name. Returns self for chaining.
     """
     for item in self.all():
         out_path = self.output_filename(f_name(item))
         write_file(out_path, f_contents(item))
     return self
Exemplo n.º 6
0
 def render_all_templates(self, **kwargs):
     """Render every item as a Jinja template and write the result out."""
     for tmpl_name in self.all():
         debug('render all templates: {}'.format(tmpl_name))
         tmpl = self.env.get_template(tmpl_name)
         # Two extra context values are passed to each template:
         #   name     : the template's loading name (e.g. base file name)
         #   filename : the template's path in the filesystem (if any)
         rendered = tmpl.render(name=tmpl.name,
                                filename=tmpl.filename,
                                **kwargs)
         write_file(self.output_filename(tmpl_name),
                    rendered.encode(encoding='utf-8', errors='strict'))
     return self
Exemplo n.º 7
0
def remove_single_parens(search_path):
    """Remove single parentheses from every file found under *search_path*.

    For each input file, writes a sibling file with the extension replaced
    by '.remove-single-parens' containing the transformed markup.
    """
    for filename in get_filenames(search_path):
        # Bug fix: the f-string had no placeholder, so every file printed
        # the literal text '(unknown)' instead of its actual name.
        print(f'Removing single parens from file: {filename}')
        parts = filename.split('.')
        out_filename = '.'.join(parts[:-1]) + '.remove-single-parens'

        top_tag = handle_single_parens(read_file(filename))
        write_file(out_filename, str(top_tag))
Exemplo n.º 8
0
 def on_train_end(self, logs=None):
     """Keras callback hook: dump the collected training metrics to JSON.

     Fix: the original declared a mutable default argument (logs={}),
     which is shared across all calls; None is the safe sentinel. *logs*
     is not used by this method, so callers are unaffected.

     NOTE(review): model_name is not defined in this method — presumably
     a module-level global; confirm it is in scope at call time.
     """
     data = {
         "losses": self.losses,
         "accuracies": self.accs,
         "val_losses": self.val_losses,
         "val_accuracies": self.val_accs,
         "val_f1s": self.val_f1s,
         "val_recalls": self.val_recalls,
         "val_precisions": self.val_precisions
     }
     write_file(data,
                f"../results/{model_name}-results.json",
                is_json=True)
Exemplo n.º 9
0
def clean_all_tables(input_paths):
    """Clean extracted HTML tables.

    Parses each input file with BeautifulSoup, strips unwanted tags via
    remove_tags (in-place mutation), and writes the prettified result as
    '<prefix>.cleaned' inside the generated-html-json directory.
    """
    for filename in get_filenames(input_paths):
        prefix = filename.split(os.sep)[-1].split('.')[0]
        out_filename = os.path.join(generated_html_json_dir(),
                                    prefix + '.cleaned')

        # Bug fix: the f-string had no placeholder, so it always printed
        # the literal '(unknown)'; print the actual file name instead.
        print(f'filename: {filename}')
        table_tag = BeautifulSoup(read_file(filename), 'html.parser')

        remove_tags(table_tag)  # mutates the soup in place

        out_dirname_parts = out_filename.split(os.sep)[:-1]
        ensure_dir_exists(os.path.join(os.sep, *out_dirname_parts))

        write_file(out_filename, table_tag.prettify())
Exemplo n.º 10
0
def reset_next_build_numb(output):
    # Advance the Jenkins-style 'nextBuildNumber' counter file under *output*
    # and return (per-build output path, current build index).
    next_build_number = output + "/nextBuildNumber"
    index = 1
    data = "%d" % (index + 1)
    if not exists_path(next_build_number):
        # First run: create an empty counter file; index stays at 1.
        make_nod(next_build_number)
    else:
        # read_file apparently returns a mapping with a "data" key here —
        # the stored value is the index to use for this build.
        index = int(read_file(next_build_number)["data"])
        data = "%d" % (index + 1)
    write_file(next_build_number, data)  # persist the incremented counter

    out = output + "/%d" % index
    # NOTE(review): this checks/creates *output*, not *out* — the per-index
    # directory is never created here. Confirm whether mk_dirs(out) was
    # intended, or whether callers create `out` themselves.
    if not exists_path(output):
        mk_dirs(output)

    return (out, index)
Exemplo n.º 11
0
def decode_file(filename, tokens):
    """Decode a file of space-separated integer token ids back into text.

    Number values are encoded as 6-slot runs beginning with
    Number.START_SEQUENCE: [start, is_negative, value, is_fraction,
    is_percent, ...]; Number.PADDING ids are skipped; every other id is
    looked up in the *tokens* mapping (keyed by the id as a string).
    The decoded text is written to '<dir>/decoded/<prefix>.decoded'.

    Fix: the original dropped into pdb (`import pdb; pdb.set_trace()`) on
    a truncated trailing number sequence and then fell through, using
    undefined/stale values. A truncated sequence now simply ends decoding.
    """
    with open(filename, 'r') as f:
        numbers = list(map(int, f.read().split()))

        result = []
        idx = 0
        while idx < len(numbers):

            num = numbers[idx]
            if num == Number.PADDING.value:
                idx += 1
                continue

            if num == Number.START_SEQUENCE.value:
                # A complete sequence needs 4 more slots after the marker.
                if idx + 4 >= len(numbers):
                    break
                is_negative, num, is_fraction, is_percent = \
                    numbers[idx + 1], numbers[idx + 2], numbers[idx + 3], \
                    numbers[idx + 4]
                if is_negative:
                    num = -num
                if is_fraction:
                    num = format(convert_whole_to_fraction(num), '.2f')
                else:
                    num = str(num)
                if is_percent:
                    num += '%'
                result.append(num)
                idx += 6  # TODO: this should be in NumberSequence?
                continue

            result.append(tokens[str(num)])
            idx += 1

        fn_parts = filename.split(os.sep)
        fn_prefix_index = fn_parts[-1].rfind('.')
        fn_prefix = fn_parts[-1][:fn_prefix_index]

        dir_name = os.path.join(os.sep.join(filename.split(os.sep)[:-1]),
                                'decoded')
        create_dirs([dir_name])

        out_filename = os.path.join(dir_name, fn_prefix + '.decoded')
        write_file(out_filename, ' '.join(result))
Exemplo n.º 12
0
def reset_last_status(result, output, index):
    """Record *index* in either 'lastFail' or 'lastPassed' under *output*,
    depending on whether the run had critical failures."""
    failed = result.statistics.total.critical.failed

    # Pick the marker file according to the outcome of the run.
    marker = output + ("/lastFail" if failed != 0 else "/lastPassed")

    if not exists_path(marker):
        make_nod(marker)
    write_file(marker, "%d" % index)
Exemplo n.º 13
0
    def __save(self, args):
        """Persist args["data"] to the user's workspace file at args["path"],
        returning a status dict describing success or failure."""
        workspace_path = self.app.config["AUTO_HOME"] + \
            "/workspace/%s%s" % (session["username"], args["path"])

        if write_file(workspace_path, args["data"]):
            return {"status": "success", "msg": "保存成功"}
        return {"status": "fail", "msg": "保存失败"}
Exemplo n.º 14
0
def html_to_json():
    """Convert extracted HTML table files to JSON via image rendering.

    For each '*table-extracted' file under the html_input samples dir:
    render it to 'out.png', extract JSON from the image, write the JSON
    and a copy of the HTML into the generated directory. A summary log
    is written to 'html_to_json_processing_results' at the end.
    """
    output_dirname = os.path.join(generated_html_json_dir())
    os.makedirs(output_dirname, exist_ok=True)

    result_string = ''
    num_all_files = 0
    num_files_processed = 0
    for full_filepath in get_filenames(html_samples_dir(), 'html_input', '*'):
        filename = full_filepath.split(os.sep)[-1].lower()

        if not filename.endswith('table-extracted'):
            continue
        print(f'{num_all_files}: full_filepath: {full_filepath}')
        result_string += full_filepath + '\n'

        num_all_files += 1
        html_to_image(full_filepath)
        json_data, error_str = image_to_json('out.png')
        if json_data is None:
            # Record the failure traceback in the summary log.
            result_string += traceback.format_exc() + '\n\n'
        else:
            num_files_processed += 1
            output_filename = \
                os.path.join(output_dirname,
                            filename.split('.')[0] + '.json')
            print(f'output_filename: {output_filename}')
            write_json_to_file(output_filename, json_data)

            output_html_filename = os.path.join(output_dirname, filename)
            copy_file(full_filepath, output_html_filename)

    # Bug fix: guard against ZeroDivisionError when no files matched.
    ratio = num_files_processed / num_all_files if num_all_files else 0.0
    result_stats = f'num_files_processed: {num_files_processed}\n' \
        f'num_all_files: {num_all_files}\n' \
        f'success ratio: {ratio}\n'
    print(result_stats)
    result_string += result_stats
    write_file(os.path.join(output_dirname, 'html_to_json_processing_results'),
               result_string)
Exemplo n.º 15
0
    def file_update(html_fn,
                    html_tokens,
                    json_fn,
                    json_tokens,
                    update_type=SINGLE_FILE):
        """Record one (html, json) token pair.

        SINGLE_FILE mode buffers a '^'-joined record in update_tokens.
        Otherwise each token string is written to its own '.tokenized'
        file under output_path/separate_files, and the '^'-joined pair of
        output paths is appended to separate_files.
        """
        if update_type == SINGLE_FILE:
            record = '^'.join([html_fn, html_tokens, json_fn, json_tokens])
            update_tokens.append(record)
            return

        # Multiple-files mode: one output file per (html, json) input pair.
        update_tokens.append((html_fn, json_fn))
        sep_dir = os.path.join(output_path, 'separate_files')
        create_dirs(sep_dir)

        output_html_fn = os.path.join(
            sep_dir, html_fn.split(os.sep)[-1] + '.tokenized')
        output_json_fn = os.path.join(
            sep_dir, json_fn.split(os.sep)[-1] + '.tokenized')
        separate_files.append(output_html_fn + '^' + output_json_fn)
        write_file(output_html_fn, html_tokens)
        write_file(output_json_fn, json_tokens)
Exemplo n.º 16
0
    def reset_last_status(self, index):
        """Record *index* in 'lastFail' or 'lastPassed' under self.output,
        depending on whether the run had critical failures.

        Fixes: the lock is now held via `with`, so it is always released
        even if a file operation raises (the original's bare
        acquire/release would leak the lock on an exception).

        NOTE(review): a threading.Lock() created fresh on every call
        synchronizes nothing across threads — if real mutual exclusion is
        intended, hoist the lock to the instance or module level.
        """
        stats = self.result.statistics
        fail = stats.total.critical.failed

        lock = threading.Lock()
        with lock:
            last_fail = self.output + "/lastFail"
            last_passed = self.output + "/lastPassed"
            data = "%d" % index

            if fail != 0:
                if not exists_path(last_fail):
                    make_nod(last_fail)
                write_file(last_fail, data)
            else:
                if not exists_path(last_passed):
                    make_nod(last_passed)
                write_file(last_passed, data)
Exemplo n.º 17
0
def generate_random_text(input_filenames, num_output_files):
    """Generate *num_output_files* training samples from random input files.

    Each iteration picks a random input file, generates an input/expected
    pair via generate_input, and writes the generated html, expected json,
    and a copy of the source input into the generated-data directory.

    Fix: the loop variable was named `id`, shadowing the builtin of the
    same name; renamed to `file_id` (purely local, callers unaffected).
    """
    print('Getting set of all chars in data', end='')
    print(' ... done')

    for file_id in range(num_output_files):
        input_fn = np.random.choice(input_filenames)

        # To be done again as some of the numbers that should be empty are 9's,
        # even in the html page.
        print('{:6d}: file: {}'.format(file_id, input_fn))

        fn_parts = input_fn.split(os.sep)
        fn_name = fn_parts[-1].split('.')
        fn_prefix, fn_type = fn_name[0], fn_name[1]

        json_input_fn = os.sep + os.path.join(*fn_parts[:-1],
                                              fn_prefix + '.json')
        json_generated_output_fn = os.path.join(generated_data_dir(),
                                                'html',
                                                str(file_id) + '.' + fn_type)
        json_expected_output_fn = os.path.join(generated_data_dir(),
                                               'expected_json',
                                               str(file_id) + '.expected_json')

        input_generated_fn = os.path.join(generated_data_dir(),
                                          'input',
                                          str(file_id) + '.input')

        generated_input, json_expected = \
            generate_input(input_fn,
                           fn_type,
                           json_input_fn)

        write_file(json_generated_output_fn, generated_input)
        write_json_to_file(json_expected_output_fn, json_expected)
        copy_file(input_fn, input_generated_fn)
Exemplo n.º 18
0
    def __save(self, args):
        """Save args["data"] to the path given by args["key"], then run the
        type-specific post-save hooks and return a status dict."""
        target = args["key"]

        result = {"status": "success", "msg": "成功:保存成功."}
        if not write_file(target, args["data"]):
            result = {"status": "fail", "msg": "失败:保存失败"}

        if target.endswith('.robot'):
            # Robot suite saved: refresh case info and log the edit.
            self.app.config['DB'].refresh_caseinfo(target, 'force')
            self.app.config['DB'].insert_loginfo(session['username'], 'suite',
                                                 'edit', target,
                                                 result['status'])

        if target.endswith('.resource'):
            # Resource saved: delete keywords or update highlight.
            update_resource(target)

        return result
Exemplo n.º 19
0
 def render(self, output_file, **kwargs):
     """Render this object's template with **kwargs and write the UTF-8
     encoded result to the computed output path. Returns self for chaining."""
     rendered = self.env.get_template(self.template_file).render(**kwargs)
     encoded = rendered.encode(encoding='utf-8', errors='strict')
     write_file(self.output_filename(output_file), encoded)
     return self