Пример #1
0
def run():
    """Build the left-to-right and right-to-left chunk tries from the
    input file named on the command line, then write the result either
    to ``args.output_file`` (if given) or to standard output.

    Relies on module-level ``args`` having ``minlength``, ``file`` and
    ``output_file`` attributes.
    """
    min_len = args.minlength
    # Lowercase every word and drop those too short to yield chunks.
    eligible = sorted(w.lower() for w in dx1.read_file(args.file)
                      if len(w) >= min_len)
    trie_ltr, trie_rtl = make_tries(eligible, min_len)

    destination = args.output_file
    if destination:
        with open(destination, 'w') as out:
            produce_output(trie_ltr, trie_rtl, out)
    else:
        produce_output(trie_ltr, trie_rtl, sys.stdout)
Пример #2
0
def run_lt(strings, min_count, verbose=False):
    """Extract sequences from *strings* and print them.

    :param strings: iterable of input strings to mine
    :param min_count: minimum sequence count to include in the output
    :param verbose: trigger verbose mode
    :return: None (output is produced via ``output_sequences``)
    """
    # BUG FIX: verbose was hard-coded to False here, so a caller passing
    # verbose=True had no effect; forward the caller's value instead.
    seqs = get_sequences(strings, min_count, verbose=verbose)
    output_sequences(seqs)

def run_ngrams(tokens_fp, min_count, verbose=False):
    """
    Run with string tokens instead of characters.

    :param tokens_fp: path to a binary object with string tokens
    :param min_count: minimum n-gram count to include in the output
    :param verbose: trigger verbose mode
    :return:
    """
    # NOTE(review): pickle.load is unsafe on untrusted data — confirm
    # tokens_fp is always a locally produced file.
    with open(tokens_fp, 'rb') as fh:
        token_strings = pickle.load(fh)

    run_lt(token_strings, min_count, verbose=verbose)



if __name__ == '__main__':
    # Command-line entry point: read strings from a dx1 file and run
    # the sequence extraction with the requested minimum count.
    parser = ArgumentParser()
    parser.add_argument('file', help='dx1 file with strings')
    parser.add_argument('--min-count', type=int, default=5)
    cli = parser.parse_args()

    # Imported here, after argument parsing, exactly as in the original
    # script flow — dx1 is only needed when run as a script.
    import dx1
    input_strings = dx1.read_file(cli.file)
    run_lt(input_strings, cli.min_count)


Пример #3
0
        print_to_file('signatures for affixes')
        pprint.pprint(sort_by_size(signatures_affixes), stream=out_file)


    # output every word to a separate file
    words_file = Path(corpus_name + '_words.txt')
    with words_file.open('w') as out_file:
        for word in words:
            print(word, file=out_file)


if __name__ == '__main__':
    # Command-line entry point: parse options, set up per-corpus
    # logging, read the dx1 data, and run the analysis.
    cli = ArgumentParser()
    cli.add_argument('file', help='dx1 file for input')
    cli.add_argument('--min-length', type=int, default=5,
                     help='minimum substring length')
    cli.add_argument('--num-words', type=int, default=200,
                     help='number of most frequently occurring strings to get')
    cli.add_argument('--verbose', action='store_true', help='verbose mode')
    opts = cli.parse_args()

    source_path = Path(opts.file)
    # The corpus is named after the input file's stem.
    corpus_name = source_path.stem
    add_to_log = write_log(corpus_name)

    input_data = dx1.read_file(source_path)
    result = run(input_data, opts.min_length, opts.num_words, opts.verbose)
    # output_result(result, corpus_name)
    add_to_log(None)