Example #1
def compress(read_file_path, write_file_path):
    num_of_chunks, chunk_size = util.chunk_file(read_file_path)
    with open(read_file_path, **file_access_modes.default_read_configuration) as read_stream, \
            open(write_file_path, **file_access_modes.write_bytes_configuration) as write_stream:

        dictionary = _generate_dictionary()
        read_limit = chunk_size

        initial_phrase = _empty_str
        compression_end = False
        compressed_rest = _empty_str
        for chunk_number in range(1, num_of_chunks + 1):
            if chunk_number == num_of_chunks:
                read_limit = None
                compression_end = True

            data = read_stream.read(read_limit)
            compressed_data, initial_phrase = _compress_data(
                data,
                dictionary,
                initial_phrase=initial_phrase,
                compression_end=compression_end)

            compressed_data = compressed_rest + compressed_data

            integer_num_of_bytes, compressed_rest = util.extract_integer_num_of_bytes(
                compressed_data)

            _write_bytes(write_stream, integer_num_of_bytes)

        _write_bytes(write_stream, compressed_rest)
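The loop above relies on util.chunk_file returning a (num_of_chunks, chunk_size) pair so the input can be read in roughly equal pieces, with the final iteration picking up the remainder via read(None). A minimal sketch of such a helper, assuming a simple size-by-CPU-count split (the heuristic and the optional parameter are assumptions, not the project's actual logic):

import os

def chunk_file(file_path, preferred_chunks=None):
    # Split the file into roughly equal pieces; the caller reads whatever
    # is left over by passing read_limit=None on the last chunk.
    file_size = os.path.getsize(file_path)
    num_of_chunks = preferred_chunks or max(1, os.cpu_count() or 1)
    chunk_size = max(1, file_size // num_of_chunks)
    return num_of_chunks, chunk_size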
Example #2
def decompress(read_file_path, write_file_path, *, code_type):
    num_of_chunks, chunk_size = util.chunk_file(read_file_path)

    code_function = _code_functions[code_type]
    ending_bit = _ending_bits[code_type]
    read_code_function = _read_code_functions[code_type]

    characters_by_frequency = _characters_by_frequencies(read_file_path)
    codes = util.generate_codes(characters_by_frequency, code_function)
    reversed_codes = util.reverse_dictionary(codes)

    rest_bits = _empty_str
    read_limit = chunk_size
    compression_end = False
    with open(read_file_path, **file_access_modes.read_bytes_configuration) as read_stream, \
            open(write_file_path, **file_access_modes.default_write_configuration) as write_stream:
        read_stream.seek(len(characters_by_frequency) + 1)

        for chunk_number in range(1, num_of_chunks + 1):
            if chunk_number == num_of_chunks:
                read_limit = None
                compression_end = True

            binary_data = read_stream.read(read_limit)
            bits = rest_bits + util.to_bits(binary_data)

            decompressed_data, rest_bits = _decompress_data(
                bits,
                reversed_codes,
                read_code_function=read_code_function,
                ending_bit=ending_bit,
                compression_end=compression_end)
            write_stream.write(decompressed_data)
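The decoder works on a flat bit string: each chunk of bytes is expanded with util.to_bits, prefixed with the bits left undecoded from the previous chunk, and whatever tail does not yet form a complete code is carried forward in rest_bits. One plausible reading of the conversion helper (an assumption, not necessarily the project's implementation):

def to_bits(binary_data):
    # Render each byte as eight characters, most significant bit first,
    # so variable-length codes can be matched against one long string.
    return ''.join(format(byte, '08b') for byte in binary_data)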
Example #3
def convert_singpairmix_to_tf_examples(dataset_name, processed_data_dir, tf_example_dir, dataset_split='all'):
    out_dir = os.path.join(tf_example_dir, dataset_name)
    out_full_dir = os.path.join(out_dir, 'all')
    util.create_dirs(out_full_dir)
    if dataset_split == 'all':
        if dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [dataset_split]
    for dataset_split in dataset_splits:
        processed_data_path = os.path.join(processed_data_dir, dataset_name, dataset_split)
        articles_path = os.path.join(processed_data_path, 'articles.tsv')
        abstracts_path = os.path.join(processed_data_path, 'summaries.tsv')
        highlight_path = os.path.join(processed_data_path, 'highlight.tsv')

        f_art = open(articles_path)
        f_abs = open(abstracts_path)
        f_hl = open(highlight_path)
        writer = open(os.path.join(out_full_dir, dataset_split + '.bin'), 'wb')
        total = util.num_lines_in_file(articles_path)
        for example_idx in tqdm(range(total)):
            raw_article_sents = f_art.readline().strip().split('\t')
            groundtruth_summ_sents = f_abs.readline().strip().split('\t')
            summary_text = '\n'.join(groundtruth_summ_sents)
            article_sent_tokens = [util.process_sent(sent, whitespace=True) for sent in raw_article_sents]
            # doc_indices is never provided in this pipeline, so map every article token to document 0.
            doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
            doc_indices_str = ' '.join([str(idx) for idx in doc_indices])
            similar_source_indices = [source_indices.split(',') for source_indices in f_hl.readline().split('\t')]

            write_bert_tf_example(similar_source_indices, raw_article_sents, summary_text, None,
                                  doc_indices_str, None, writer, dataset_name)

        writer.close()
        f_art.close()
        f_abs.close()
        f_hl.close()
        if dataset_name in ('cnn_dm', 'newsroom', 'xsum'):
            chunk_size = 1000
        else:
            chunk_size = 1
        util.chunk_file(dataset_split, out_full_dir, out_dir, chunk_size=chunk_size)
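highlight.tsv stores, for each summary sentence, a comma-separated group of source-sentence indices, with groups separated by tabs; the loop above keeps them as strings. A small parsing sketch that also converts them to integers (the helper name and the int conversion are illustrative assumptions):

def parse_highlight_line(line):
    # "0,3\t5" -> [[0, 3], [5]]; empty groups are dropped.
    groups = [group for group in line.rstrip('\n').split('\t') if group]
    return [[int(idx) for idx in group.split(',')] for group in groups]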
Example #4
def decompress(read_file_path, write_file_path):
    num_of_chunks, chunk_size = util.chunk_file(read_file_path)

    with open(read_file_path, **file_access_modes.read_bytes_configuration) as read_stream, \
            open(write_file_path, **file_access_modes.default_write_configuration) as write_stream:
        dictionary = _generate_dictionary()
        reversed_dictionary = util.reverse_dictionary(dictionary)

        rest_bits = _empty_str
        initial_phrase = _empty_str
        read_limit = chunk_size
        for chunk_number in range(1, num_of_chunks + 1):
            if chunk_number == num_of_chunks:
                read_limit = None

            binary_data = read_stream.read(read_limit)
            bits = rest_bits + util.to_bits(binary_data)

            decompressed_data, rest_bits, initial_phrase = _decompress_data(
                bits,
                dictionary,
                reversed_dictionary,
                initial_phrase=initial_phrase)
            write_stream.write(decompressed_data)
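Decompression resolves codes through the reverse of the table built by _generate_dictionary. util.reverse_dictionary is presumably just a key/value swap; a minimal sketch under that assumption:

def reverse_dictionary(dictionary):
    # Swap keys and values so a code can be looked up by its bit pattern.
    # Assumes the mapping is one-to-one, as an LZW-style code table is.
    return {value: key for key, value in dictionary.items()}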
Example #5
def compress(read_stream_path, write_stream_path, *, code_type):
    code_function = _code_functions[code_type]
    ending_bit = _ending_bits[code_type]

    num_of_threads, thread_chunk = util.chunk_file(read_stream_path)

    results_queue = queue.PriorityQueue()
    threads = list()

    read_limit = thread_chunk
    for thread_number in range(1, num_of_threads + 1):
        if thread_number == num_of_threads:
            read_limit = None

        read_stream_start_position = thread_chunk * (thread_number - 1)
        thread_result_file_path = util.thread_result_file_path(
            write_stream_path, thread_number)

        threading_data = (results_queue, thread_number,
                          read_stream_start_position, read_limit)
        thread = threading.Thread(target=_compress_file_content,
                                  args=(read_stream_path,
                                        thread_result_file_path),
                                  kwargs={
                                      _threading_data_parameter:
                                      threading_data,
                                      _code_function_parameter: code_function,
                                      _ending_bit_parameter: ending_bit
                                  })
        thread.start()
        threads.append(thread)

    for thread in threads:
        thread.join()

    _combine_threads_results(results_queue, write_stream_path, num_of_threads)
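Each worker compresses its slice of the input into its own temporary file and reports back through the PriorityQueue, which keeps results ordered by thread number. A hypothetical combiner consistent with the call above, assuming each worker puts a (thread_number, result_file_path) tuple on the queue:

def combine_threads_results(results_queue, write_stream_path, num_of_threads):
    # Drain the queue in thread order and append each partial result to the
    # final output file. Sketch only; the queue payload format is assumed.
    with open(write_stream_path, 'wb') as write_stream:
        for _ in range(num_of_threads):
            _, thread_result_file_path = results_queue.get()
            with open(thread_result_file_path, 'rb') as part_stream:
                write_stream.write(part_stream.read())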
Example #6
def main(unused_argv):

    print('Running statistics on %s' % FLAGS.exp_name)

    if len(unused_argv) != 1:  # raise an error if the flags were entered incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)

    if FLAGS.all_actions:
        FLAGS.sent_dataset = True
        FLAGS.ssi_dataset = True
        FLAGS.print_output = True
        FLAGS.highlight = True

    original_dataset_name = 'xsum' if 'xsum' in FLAGS.dataset_name else 'cnn_dm' if (
        'cnn_dm' in FLAGS.dataset_name
        or 'duc_2004' in FLAGS.dataset_name) else ''
    vocab = Vocab(FLAGS.vocab_path + '_' + original_dataset_name,
                  FLAGS.vocab_size)  # create a vocabulary

    source_dir = os.path.join(data_dir, FLAGS.dataset_name)
    util.create_dirs(html_dir)

    if FLAGS.dataset_split == 'all':
        if FLAGS.dataset_name == 'duc_2004':
            dataset_splits = ['test']
        else:
            dataset_splits = ['test', 'val', 'train']
    else:
        dataset_splits = [FLAGS.dataset_split]
    for dataset_split in dataset_splits:
        source_files = sorted(glob.glob(source_dir + '/' + dataset_split +
                                        '*'))
        if FLAGS.exp_name == 'reference':
            # summary_dir = log_dir + default_exp_name + '/decode_test_' + str(max_enc_steps) + \
            #                 'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/reference'
            # summary_files = sorted(glob.glob(summary_dir + '/*_reference.A.txt'))
            summary_dir = source_dir
            summary_files = source_files
        else:
            if FLAGS.exp_name == 'cnn_dm':
                summary_dir = log_dir + FLAGS.exp_name + '/decode_test_400maxenc_4beam_35mindec_100maxdec_ckpt-238410/decoded'
            else:
                ckpt_folder = util.find_largest_ckpt_folder(log_dir +
                                                            FLAGS.exp_name)
                summary_dir = log_dir + FLAGS.exp_name + '/' + ckpt_folder + '/decoded'
                # summary_dir = log_dir + FLAGS.exp_name + '/decode_test_' + str(max_enc_steps) + \
                #             'maxenc_4beam_' + str(min_dec_steps) + 'mindec_' + str(max_dec_steps) + 'maxdec_ckpt-238410/decoded'
            summary_files = sorted(glob.glob(summary_dir + '/*'))
        if len(summary_files) == 0:
            raise Exception('No files found in %s' % summary_dir)
        example_generator = data.example_generator(source_dir + '/' +
                                                   dataset_split + '*',
                                                   True,
                                                   False,
                                                   is_original=True)
        pros = {
            'annotators': 'dcoref',
            'outputFormat': 'json',
            'timeout': '5000000'
        }
        all_merge_examples = []
        num_extracted_list = []
        distances = []
        relative_distances = []
        html_str = ''
        extracted_sents_in_article_html = ''
        name = FLAGS.dataset_name + '_' + FLAGS.exp_name
        if FLAGS.coreference_replacement:
            name += '_coref'
        highlight_file_name = os.path.join(
            html_dir, FLAGS.dataset_name + '_' + FLAGS.exp_name)
        if FLAGS.consider_stopwords:
            highlight_file_name += '_stopwords'
        if FLAGS.highlight:
            extracted_sents_in_article_html_file = open(
                highlight_file_name + '_extracted_sents.html', 'wb')
        if FLAGS.kaiqiang:
            kaiqiang_article_texts = []
            kaiqiang_abstract_texts = []
            util.create_dirs(kaiqiang_dir)
            kaiqiang_article_file = open(
                os.path.join(
                    kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split +
                    '_' + str(FLAGS.min_matched_tokens) + '_articles.txt'),
                'wb')
            kaiqiang_abstract_file = open(
                os.path.join(
                    kaiqiang_dir, FLAGS.dataset_name + '_' + dataset_split +
                    '_' + str(FLAGS.min_matched_tokens) + '_abstracts.txt'),
                'wb')
        if FLAGS.ssi_dataset:
            if FLAGS.tag_tokens:
                with_coref_and_ssi_dir = lambdamart_dir + '_and_tag_tokens'
            else:
                with_coref_and_ssi_dir = lambdamart_dir
            lambdamart_out_dir = os.path.join(with_coref_and_ssi_dir,
                                              FLAGS.dataset_name)
            if FLAGS.sentence_limit == 1:
                lambdamart_out_dir += '_singles'
            if FLAGS.consider_stopwords:
                lambdamart_out_dir += '_stopwords'
            lambdamart_out_full_dir = os.path.join(lambdamart_out_dir, 'all')
            util.create_dirs(lambdamart_out_full_dir)
            lambdamart_writer = open(
                os.path.join(lambdamart_out_full_dir, dataset_split + '.bin'),
                'wb')

        simple_similar_source_indices_list_plus_empty = []
        example_idx = -1
        instance_idx = 0
        total = len(source_files) * 1000 if (
            'cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
            or 'xsum' in FLAGS.dataset_name) else len(source_files)
        random_choices = None
        if FLAGS.randomize:
            if FLAGS.dataset_name == 'cnn_dm':
                list_order = np.random.permutation(11490)
                random_choices = list_order[:FLAGS.num_instances]
        for example in tqdm(example_generator, total=total):
            example_idx += 1
            if FLAGS.num_instances != -1 and instance_idx >= FLAGS.num_instances:
                break
            if random_choices is not None and example_idx not in random_choices:
                continue
        # for file_idx in tqdm(range(len(source_files))):
        #     example = get_tf_example(source_files[file_idx])
            article_text = example.features.feature[
                'article'].bytes_list.value[0].decode().lower()
            if FLAGS.exp_name == 'reference':
                summary_text, all_summary_texts = get_summary_from_example(
                    example)
            else:
                summary_text = get_summary_text(summary_files[example_idx])
            article_tokens = split_into_tokens(article_text)
            if 'raw_article_sents' in example.features.feature and len(
                    example.features.feature['raw_article_sents'].bytes_list.
                    value) > 0:
                raw_article_sents = example.features.feature[
                    'raw_article_sents'].bytes_list.value

                raw_article_sents = [
                    sent.decode() for sent in raw_article_sents
                    if sent.decode().strip() != ''
                ]
                article_sent_tokens = [
                    util.process_sent(sent, whitespace=True)
                    for sent in raw_article_sents
                ]
            else:
                # article_text = util.to_unicode(article_text)

                # sent_pros = {'annotators': 'ssplit', 'outputFormat': 'json', 'timeout': '5000000'}
                # sents_result_dict = nlp.annotate(str(article_text), properties=sent_pros)
                # article_sent_tokens = [[token['word'] for token in sent['tokens']] for sent in sents_result_dict['sentences']]

                raw_article_sents = nltk.tokenize.sent_tokenize(article_text)
                article_sent_tokens = [
                    util.process_sent(sent) for sent in raw_article_sents
                ]
            if FLAGS.top_n_sents != -1:
                article_sent_tokens = article_sent_tokens[:FLAGS.top_n_sents]
                raw_article_sents = raw_article_sents[:FLAGS.top_n_sents]
            article_sents = [' '.join(sent) for sent in article_sent_tokens]
            try:
                article_tokens_string = str(' '.join(article_sents))
            except:
                try:
                    article_tokens_string = str(' '.join(
                        [sent.decode('latin-1') for sent in article_sents]))
                except:
                    raise

            if len(article_sent_tokens) == 0:
                continue

            summary_sent_tokens = split_into_sent_tokens(summary_text)
            if 'doc_indices' in example.features.feature and len(
                    example.features.feature['doc_indices'].bytes_list.value
            ) > 0:
                doc_indices_str = example.features.feature[
                    'doc_indices'].bytes_list.value[0].decode()
                if '1' in doc_indices_str:
                    doc_indices = [
                        int(x) for x in doc_indices_str.strip().split()
                    ]
                    rel_sent_positions = importance_features.get_sent_indices(
                        article_sent_tokens, doc_indices)
                else:
                    num_tokens_total = sum(
                        [len(sent) for sent in article_sent_tokens])
                    rel_sent_positions = list(range(len(raw_article_sents)))
                    doc_indices = [0] * num_tokens_total

            else:
                rel_sent_positions = None
                doc_indices = None
                doc_indices_str = None
            if 'corefs' in example.features.feature and len(
                    example.features.feature['corefs'].bytes_list.value) > 0:
                corefs_str = example.features.feature[
                    'corefs'].bytes_list.value[0]
                corefs = json.loads(corefs_str)
            else:
                # Guard against a NameError below when an example carries no coreference info.
                corefs_str = None
                corefs = None
            # summary_sent_tokens = limit_to_n_tokens(summary_sent_tokens, 100)

            similar_source_indices_list_plus_empty = []

            simple_similar_source_indices, lcs_paths_list, article_lcs_paths_list, smooth_article_paths_list = ssi_functions.get_simple_source_indices_list(
                summary_sent_tokens,
                article_sent_tokens,
                vocab,
                FLAGS.sentence_limit,
                FLAGS.min_matched_tokens,
                not FLAGS.consider_stopwords,
                lemmatize=FLAGS.lemmatize,
                multiple_ssi=FLAGS.multiple_ssi)

            article_paths_parameter = article_lcs_paths_list if FLAGS.tag_tokens else None
            article_paths_parameter = smooth_article_paths_list if FLAGS.smart_tags else article_paths_parameter
            restricted_source_indices = util.enforce_sentence_limit(
                simple_similar_source_indices, FLAGS.sentence_limit)
            for summ_sent_idx, summ_sent in enumerate(summary_sent_tokens):
                if FLAGS.sent_dataset:
                    if len(restricted_source_indices[summ_sent_idx]) == 0:
                        continue
                    merge_example = get_merge_example(
                        restricted_source_indices[summ_sent_idx],
                        article_sent_tokens, summ_sent, corefs,
                        article_paths_parameter[summ_sent_idx])
                    all_merge_examples.append(merge_example)

            simple_similar_source_indices_list_plus_empty.append(
                simple_similar_source_indices)
            if FLAGS.ssi_dataset:
                summary_text_to_save = [
                    s for s in all_summary_texts
                ] if FLAGS.dataset_name == 'duc_2004' else summary_text
                write_lambdamart_example(simple_similar_source_indices,
                                         raw_article_sents,
                                         summary_text_to_save, corefs_str,
                                         doc_indices_str,
                                         article_paths_parameter,
                                         lambdamart_writer)

            if FLAGS.highlight:
                highlight_article_lcs_paths_list = smooth_article_paths_list if FLAGS.smart_tags else article_lcs_paths_list
                # simple_ssi_plus_empty = [ [s[0] for s in sim_source_ind] for sim_source_ind in simple_similar_source_indices]
                extracted_sents_in_article_html = ssi_functions.html_highlight_sents_in_article(
                    summary_sent_tokens, simple_similar_source_indices,
                    article_sent_tokens, doc_indices, lcs_paths_list,
                    highlight_article_lcs_paths_list)
                extracted_sents_in_article_html_file.write(
                    extracted_sents_in_article_html.encode())

            instance_idx += 1

        if FLAGS.ssi_dataset:
            lambdamart_writer.close()
            if FLAGS.dataset_name in ('cnn_dm', 'newsroom', 'xsum'):
                chunk_size = 1000
            else:
                chunk_size = 1
            util.chunk_file(dataset_split,
                            lambdamart_out_full_dir,
                            lambdamart_out_dir,
                            chunk_size=chunk_size)

        if FLAGS.sent_dataset:
            with_coref_dir = data_dir + '_and_tag_tokens' if FLAGS.tag_tokens else data_dir
            out_dir = os.path.join(with_coref_dir,
                                   FLAGS.dataset_name + '_sent')
            if FLAGS.sentence_limit == 1:
                out_dir += '_singles'
            if FLAGS.consider_stopwords:
                out_dir += '_stopwords'
            if FLAGS.coreference_replacement:
                out_dir += '_coref'
            if FLAGS.top_n_sents != -1:
                out_dir += '_n=' + str(FLAGS.top_n_sents)
            util.create_dirs(out_dir)
            convert_data.write_with_generator(iter(all_merge_examples),
                                              len(all_merge_examples), out_dir,
                                              dataset_split)

        if FLAGS.print_output:
            # html_str = FLAGS.dataset + ' | ' + FLAGS.exp_name + '<br><br><br>' + html_str
            # save_fusions_to_file(html_str)
            ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name)
            if FLAGS.consider_stopwords:
                ssi_path += '_stopwords'
            util.create_dirs(ssi_path)
            if FLAGS.dataset_name == 'duc_2004' and FLAGS.abstract_idx != 0:
                abstract_idx_str = '_%d' % FLAGS.abstract_idx
            else:
                abstract_idx_str = ''
            with open(
                    os.path.join(
                        ssi_path,
                        dataset_split + '_ssi' + abstract_idx_str + '.pkl'),
                    'wb') as f:
                pickle.dump(simple_similar_source_indices_list_plus_empty, f)

        if FLAGS.kaiqiang:
            # kaiqiang_article_file.write('\n'.join(kaiqiang_article_texts))
            # kaiqiang_abstract_file.write('\n'.join(kaiqiang_abstract_texts))
            kaiqiang_article_file.close()
            kaiqiang_abstract_file.close()
        if FLAGS.highlight:
            extracted_sents_in_article_html_file.close()
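The dataset-split branching at the top of main mirrors the one in Example #3. A small helper, hypothetical and not part of the original code, would express the shared pattern once:

def choose_dataset_splits(dataset_name, dataset_split):
    # DUC 2004 ships only a test split; everything else gets test/val/train.
    if dataset_split != 'all':
        return [dataset_split]
    return ['test'] if dataset_name == 'duc_2004' else ['test', 'val', 'train']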