import base64
import collections
import functools
import json
import os
import pickle

import conllu

# Project-local helpers (open_file, open_json_config, Base64Encoder,
# PatternEncoder, MultiprocessMap, factories, decode_pattern, get_stats,
# pattern_extraction, conversion_function) are assumed to be importable
# from the surrounding package.


def get_pattern_type_freq(ctx, infile_patterns, frequency_stats, outfile):
    """Aggregate pattern frequencies by pattern profile (type)."""
    pattern_types = collections.defaultdict(int)
    number = 0
    stats = {}
    with open_file(frequency_stats) as infile:
        for line in infile:
            pattern, pstats = json.loads(line)
            stats[pattern] = pstats
    with open_file(infile_patterns, 'rb') as infile:
        # The pattern file is a stream of pickled (pattern, decoded_pattern)
        # tuples; read until the stream is exhausted.
        while True:
            try:
                pattern, decoded_pattern = pickle.load(infile)
                number += 1
                ctx.obj['logger'].info("Pattern %d", number)
                pattern_type = decoded_pattern.get_pattern_profile()
                # Fall back to a frequency of 1 for patterns without stats.
                pattern_types[pattern_type] += stats.get(pattern, {}).get(
                    'frequency', 1)
            except EOFError:
                break
    ctx.obj['logger'].info("Found %d pattern types", len(pattern_types))
    with open_file(outfile, 'w') as o:
        json.dump(pattern_types, o)
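

# A minimal sketch of the streamed-pickle format consumed above: several
# objects pickled back to back into one file and read until EOFError. The
# file name and payloads are made up for illustration.
def _pickle_stream_demo(path='patterns_demo.pkl'):
    with open(path, 'wb') as out:
        for record in [('p1', 'decoded-1'), ('p2', 'decoded-2')]:
            pickle.dump(record, out)  # one pickle per record, appended
    with open(path, 'rb') as stream:
        while True:
            try:
                print(pickle.load(stream))  # records come back in write order
            except EOFError:  # raised once the stream is exhausted
                break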


def get_vocabulary_probs(ctx, vocabulary, outfile, add_smoothing):
    """Turn per-level frequency counts into add-k smoothed probabilities."""
    # The vocabulary can be passed inline as a JSON string or as a file path.
    try:
        vocabularies = json.loads(vocabulary)
    except json.JSONDecodeError:
        with open_file(vocabulary) as dict_file:
            vocabularies = json.load(dict_file)
    vocabularies_probs = {}
    for level in vocabularies:
        freq = 0
        entries = len(vocabularies[level])
        probs = {}
        for entry in vocabularies[level]:
            freq += vocabularies[level][entry]
        for entry in vocabularies[level]:
            probs[entry] = (vocabularies[level][entry] + add_smoothing) / (
                freq + add_smoothing * entries)
        # Store the smoothed probabilities together with the probability
        # mass assigned to an unseen entry at this level.
        vocabularies_probs[level] = (probs, add_smoothing /
                                     (freq + add_smoothing * entries))
    with open_file(outfile, 'w') as o:
        json.dump(vocabularies_probs, o)
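

# A toy check of the add-k (Lidstone) smoothing computed above, assuming one
# level with counts {"a": 3, "b": 1} and k = 0.5: each seen entry gets
# (count + k) / (total + k * V), an unseen entry gets k / (total + k * V).
def _smoothing_demo():
    counts, k = {'a': 3, 'b': 1}, 0.5
    total, vocab_size = sum(counts.values()), len(counts)
    probs = {e: (c + k) / (total + k * vocab_size) for e, c in counts.items()}
    unseen = k / (total + k * vocab_size)
    print(probs, unseen)  # {'a': 0.7, 'b': 0.3} 0.1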


def convert_pattern_list(ctx, infile, outfile, remove_hapax):
    """Group a pattern-sorted TSV stream into one JSON line per pattern."""

    def write_pattern(pattern, contents, outfile):
        json.dump((pattern, contents), outfile)
        outfile.write("\n")

    with open_file(infile) as infile:
        with open_file(outfile, 'w') as outfile:
            current_pattern = None
            contents = []
            for line in infile:
                pattern, content = line.rstrip().split("\t")
                if pattern != current_pattern:
                    # A new run starts; flush the previous group. With
                    # remove_hapax, single-occurrence groups are dropped.
                    if current_pattern is not None:
                        if not remove_hapax or len(contents) > 1:
                            write_pattern(current_pattern, contents, outfile)
                    current_pattern = pattern
                    contents = []
                contents.append(content)
            # Flush the final group (guard against empty input).
            if current_pattern is not None and (not remove_hapax
                                                or len(contents) > 1):
                write_pattern(current_pattern, contents, outfile)
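

# A minimal sketch of the grouping contract above, with made-up data: the
# TSV input must already be sorted by pattern (e.g. by an upstream `sort`)
# so that each pattern forms one contiguous run; itertools.groupby then
# mirrors what the manual loop does.
def _grouping_demo():
    import itertools
    lines = ['pat_a\ts1', 'pat_a\ts2', 'pat_b\ts3']
    rows = (line.split('\t') for line in lines)
    for pattern, group in itertools.groupby(rows, key=lambda row: row[0]):
        contents = [content for _, content in group]
        if len(contents) > 1:  # what remove_hapax keeps
            print(json.dumps((pattern, contents)))  # ["pat_a", ["s1", "s2"]]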


def get_top_n_base_patterns(ctx, patterns_file, base_patterns_file, n, outfile,
                            example_ids):
    """Keep the n most frequent base patterns per pattern, with examples."""
    # First pass: count example sentences per base pattern.
    with open_file(base_patterns_file) as infile:
        base_patterns = {}
        for line in infile:
            pattern, sentences = json.loads(line)
            base_patterns[pattern] = len(sentences)
    bp_example_ids = set()
    with open_file(patterns_file) as infile:
        with open_file(outfile, 'w') as o:
            for line in infile:
                pattern, content = json.loads(line)
                bp = content['base_patterns']
                if len(bp) > n:
                    bp = sorted(bp,
                                key=lambda p: base_patterns[p],
                                reverse=True)[:n]
                bp_set = set(bp)
                bp_with_examples = []
                with open_file(base_patterns_file) as basefile:
                    for baseline in basefile:
                        bpattern, sentences = json.loads(baseline)
                        if bpattern in bp_set:
                            examples = [
                                json.loads(sentence) for sentence in sentences
                            ]
                            # Sentence ids were stored 1-based; shift them
                            # back to 0-based.
                            for example in examples:
                                example[0] = example[0] - 1
                            bp_with_examples.append((bpattern, examples))
                            bp_example_ids.update(
                                sentence[0] for sentence in examples)
                            # Stop scanning the base-pattern file once every
                            # requested base pattern has been found.
                            bp_set.remove(bpattern)
                            if not bp_set:
                                break
                content['base_patterns'] = bp_with_examples
                json.dump([pattern, content], o)
                o.write("\n")
    if example_ids is not None:
        with open_file(example_ids, 'w') as o:
            json.dump(list(bp_example_ids), o)


def decode_pattern_collection(ctx, infile, encoder, outfile, string, unknown):
    """Decode a pattern collection, optionally dropping unknown elements."""
    with open_file(encoder, 'rb') as encoder_file:
        pattern_encoder = Base64Encoder(PatternEncoder.load(encoder_file),
                                        binary=False)
    with open_file(infile) as infile:
        with open_file(outfile, 'w') as o:
            for line in infile:
                pattern, content = json.loads(line)
                decoded_pattern = pattern_encoder.decode(pattern)
                if string:
                    out_pattern = str(decoded_pattern)
                else:
                    out_pattern = base64.b64encode(
                        pickle.dumps(decoded_pattern)).decode('ascii')
                base_patterns = content.get('base_patterns', [])
                decoded_base_patterns = []
                for base_pattern in base_patterns:
                    try:
                        examples = []
                        # Base patterns may or may not carry examples.
                        if len(base_pattern) == 2:
                            examples = base_pattern[1]
                            base_pattern = base_pattern[0]
                        decoded_pattern = pattern_encoder.decode(base_pattern)
                        # Skip base patterns that contain the unknown symbol.
                        if unknown is None or all(
                                element != unknown for element in
                                decoded_pattern.get_element_list()):
                            if string:
                                cout_pattern = str(decoded_pattern)
                            else:
                                cout_pattern = base64.b64encode(
                                    pickle.dumps(decoded_pattern)).decode(
                                        'ascii')
                            decoded_base_patterns.append(
                                [cout_pattern, examples])
                    except Exception:
                        ctx.obj['logger'].warning(
                            "Could not test pattern for unknown, skipping.")
                content['base_patterns'] = decoded_base_patterns
                json.dump((out_pattern, content), o)
                o.write("\n")


def corpus2sentences(ctx, infile, outdir, example_ids):
    """Split a CoNLL-U corpus into one file per sentence."""
    try:
        os.mkdir(outdir)
    except OSError:
        print("Creation of the directory %s failed" % outdir)
    if example_ids is not None:
        with open_file(example_ids) as ids_file:
            example_ids = set(json.load(ids_file))
    with open_file(infile) as corpusfile:
        for sent_id, sentence in enumerate(conllu.parse_incr(corpusfile)):
            ctx.obj['logger'].info("Sentence %d", sent_id)
            if example_ids is None or sent_id in example_ids:
                # Name each sentence file after its 0-based corpus position.
                with open(os.path.join(outdir, str(sent_id)), 'w') as out:
                    out.write(sentence.serialize())
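

# A tiny round-trip for the conllu calls used above, on a made-up two-token
# sentence (CoNLL-U: ten tab-separated columns per token, blank line between
# sentences); parse_incr streams sentences from any file-like object.
def _conllu_demo():
    import io
    sample = ("# text = Hello world\n"
              "1\tHello\thello\tINTJ\t_\t_\t0\troot\t_\t_\n"
              "2\tworld\tworld\tNOUN\t_\t_\t1\tvocative\t_\t_\n"
              "\n")
    for sent_id, sentence in enumerate(conllu.parse_incr(io.StringIO(sample))):
        print(sent_id)
        print(sentence.serialize(), end='')  # re-emits the CoNLL-U block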


def decode_patterns(ctx, infile, encoder, outfile, processes):
    """Decode patterns in parallel and write them as a pickle stream."""
    with open_file(encoder, 'rb') as encoder_file:
        pattern_encoder = Base64Encoder(PatternEncoder.load(encoder_file),
                                        binary=False)
    with open_file(infile) as infile:
        with open_file(outfile, 'wb') as o:
            with MultiprocessMap(processes, chunksize=1000) as m:
                for pattern, decoded_pattern in m(
                        functools.partial(decode_pattern,
                                          pattern_encoder=pattern_encoder),
                        infile):
                    ctx.obj['logger'].info("Pattern")
                    pickle.dump((pattern, decoded_pattern), o)


def filter_patterns(ctx, patterns, stats, feature, threshold, outfile):
    """Keep only patterns whose value for `feature` reaches `threshold`."""
    keep = set()
    with open_file(stats) as infile:
        for line in infile:
            pattern, pattern_stats = json.loads(line)
            if pattern_stats.get(feature, 0) >= threshold:
                keep.add(pattern)
    with open_file(patterns) as infile:
        with open_file(outfile, 'w') as o:
            for line in infile:
                pattern, _ = json.loads(line)
                if pattern in keep:
                    o.write(line)


def get_top_n(ctx, patterns_file, pattern_stats, stat, n, outfile):
    """Select the n best patterns by `stat` and attach their base patterns."""
    with open_file(pattern_stats) as infile:
        pattern_stats = {}
        for line in infile:
            pattern, stats = json.loads(line)
            pattern_stats[pattern] = stats
    patterns = sorted(pattern_stats.items(),
                      key=lambda item: item[1][stat],
                      reverse=True)
    # Map each top-n pattern to a one-element list holding its rank; the
    # matching content is appended to that list in the pass below.
    patterns = {
        pattern[0]: [rank]
        for rank, pattern in enumerate(patterns[:n])
    }
    with open_file(patterns_file) as infile:
        for line in infile:
            pattern, content = json.loads(line)
            if pattern in patterns:
                patterns[pattern].append(content)
    with open_file(outfile, 'w') as o:
        # Emit the selected patterns in rank order.
        for pattern, value in sorted(patterns.items(),
                                     key=lambda item: item[1][0]):
            json.dump([
                pattern, {
                    'stats': pattern_stats[pattern],
                    'base_patterns': value[1]
                }
            ], o)
            o.write("\n")


def add_pattern_stats(ctx, infile_patterns, outfile, known_stats, base_patterns,
                      decoded_patterns, config, vocabulary_probs,
                      pattern_profile_frequency):
    """Annotate each pattern with statistics from the optional side inputs."""
    base_level = None
    if config is not None:
        base_level = open_json_config(config)['word_level']
    if decoded_patterns is not None:
        # Replace the path argument with a pattern -> decoded pattern mapping.
        with open_file(decoded_patterns, 'rb') as infile:
            decoded_patterns = {}
            while True:
                try:
                    pattern, decoded_pattern = pickle.load(infile)
                    decoded_patterns[pattern] = decoded_pattern
                except EOFError:
                    break
    if base_patterns is not None:
        # Replace the path argument with a pattern -> sentence count mapping.
        with open_file(base_patterns) as infile:
            base_patterns = {}
            for line in infile:
                pattern, sentences = json.loads(line)
                base_patterns[pattern] = len(sentences)
    if pattern_profile_frequency is not None:
        with open_file(pattern_profile_frequency, 'r') as infile:
            pattern_profile_frequency = json.load(infile)
    if vocabulary_probs is not None:
        with open_file(vocabulary_probs, 'r') as infile:
            vocabulary_probs = json.load(infile)
    if known_stats is not None:
        with open_file(known_stats) as infile:
            known_stats = {}
            for line in infile:
                pattern, stats = json.loads(line)
                known_stats[pattern] = stats
    number = 0
    with open_file(infile_patterns) as infile:
        with open_file(outfile, 'w') as o:
            for pattern, stats in map(
                    functools.partial(
                        get_stats,
                        decoded_patterns=decoded_patterns,
                        known_stats=known_stats,
                        base_patterns=base_patterns,
                        base_level=base_level,
                        pattern_profile_frequency=pattern_profile_frequency,
                        vocabulary_probs=vocabulary_probs), infile):
                number += 1
                ctx.obj['logger'].info("Pattern %d", number)
                json.dump((pattern, stats), o)
                o.write("\n")


def extract_patterns(ctx, infile, outfile_patterns, outfile_base,
                     encoded_dictionaries, config, keep_only_word,
                     keep_only_dict_words, skip_unknown, only_base):
    """Extract patterns and base patterns from an encoded CoNLL-U corpus."""
    config = open_json_config(config)
    word_level = config["word_level"]
    phrase_tags = config["phrase_tags"]
    unknown_type = config["unknown"]
    # The dictionaries can be passed inline as a JSON string or as a file path.
    try:
        encoded_dict = json.loads(encoded_dictionaries)
    except json.JSONDecodeError:
        with open_file(encoded_dictionaries) as dict_file:
            encoded_dict = json.load(dict_file)
    meta_symbols = encoded_dict.get("__special__", {})
    token_start = meta_symbols.get("__TOKEN_START__", "__TOKEN_START__")
    token_end = meta_symbols.get("__TOKEN_END__", "__TOKEN_END__")
    unknown = {}
    for level in encoded_dict.keys():
        if level != "__special__":
            unknown[level] = encoded_dict[level].get(unknown_type,
                                                     unknown_type)
    if keep_only_dict_words:
        known = {
            level: set(vocab.values())
            for level, vocab in encoded_dict.items()
            if isinstance(vocab, collections.abc.Mapping)
        }
    else:
        known = None
    if phrase_tags:
        # Translate the phrase tags into their encoded form.
        phrase_tags = [
            encoded_dict.get("upos", {}).get(element, element)
            for element in phrase_tags
        ]
        special_node_conversion = functools.partial(conversion_function,
                                                    tags=phrase_tags)
    else:
        special_node_conversion = None
    if keep_only_word is not None:
        keep_only_word = encoded_dict["lemma"][keep_only_word]
    # The dictionaries are no longer needed; free the memory before extraction.
    del encoded_dict
    extractor_config = config["extractor"]
    extractor_config['options'][
        'special_node_conversion'] = special_node_conversion
    extractor_config['options']['left_bracket'] = meta_symbols.get(
        extractor_config['options']['left_bracket'])
    extractor_config['options']['right_bracket'] = meta_symbols.get(
        extractor_config['options']['right_bracket'])
    extractor_config['options']['comma'] = meta_symbols.get(
        extractor_config['options']['comma'])
    extractor = factories.create_from_name('extractor', extractor_config)
    with open_file(infile) as infile:
        with open_file(outfile_patterns, 'w') as outfile_patterns:
            with open_file(outfile_base, 'w') as outfile_base:
                for sentence_patterns in map(
                        functools.partial(pattern_extraction,
                                          extractor=extractor,
                                          word_level=word_level,
                                          token_start=token_start,
                                          token_end=token_end,
                                          keep_only_word=keep_only_word,
                                          logger=ctx.obj['logger'],
                                          skip_unknown=skip_unknown,
                                          unknowns=unknown,
                                          known=known),
                        enumerate(conllu.parse_incr(infile))):
                    for is_base_pattern, pattern, content in sentence_patterns:
                        if not is_base_pattern:
                            if not only_base:
                                print("\t".join([pattern, str(content)]),
                                      file=outfile_patterns)
                        else:
                            print("\t".join([pattern, json.dumps(content)]),
                                  file=outfile_base)


def test_open_text_file(mockfunction):
    filename = 'test.txt'
    open_file(filename)
    # Plain text files are opened in text mode with an explicit UTF-8 encoding.
    mockfunction.assert_called_with(filename, 'r', encoding='utf-8')


def test_open_zipped_binary_file(mockfunction):
    filename = 'test.bin.gz'
    open_file(filename, 'rb')
    # Gzipped files are opened in binary mode with no encoding argument.
    mockfunction.assert_called_with(filename, 'rb')
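

# A minimal sketch of the behaviour these tests pin down (hypothetical; the
# real open_file lives elsewhere in the project): dispatch on the .gz suffix
# and default text reads to UTF-8.
def _open_file_sketch(filename, mode='r', encoding='utf-8'):
    import gzip
    import io
    if filename.endswith('.gz'):
        return gzip.open(filename, mode)
    if 'b' in mode:
        return io.open(filename, mode)
    return io.open(filename, mode, encoding=encoding)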