예제 #1
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def get_pattern_type_freq(ctx, infile_patterns, frequency_stats, outfile):
    """Sum pattern frequencies per pattern profile and dump the result as JSON.

    Reads per-pattern stats (JSON lines) and a pickle stream of
    (pattern, decoded_pattern) pairs; aggregates the 'frequency' stat by the
    decoded pattern's profile.
    """

    # pattern -> stats mapping read from the frequency-stats file
    stats = {}
    with open_file(frequency_stats) as infile:
        for line in infile:
            pattern, pstats = json.loads(line)
            stats[pattern] = pstats

    pattern_types = collections.defaultdict(int)
    number = 0

    with open_file(infile_patterns, 'rb') as infile:
        # the pickle stream holds an unknown number of records; read until EOF
        while True:
            try:
                pattern, decoded_pattern = pickle.load(infile)
            except EOFError:
                break

            number += 1
            ctx.obj['logger'].info("Pattern " + str(number))
            profile = decoded_pattern.get_pattern_profile()
            # fall back to a frequency of 1 when no stats entry exists
            pattern_types[profile] += stats.get(pattern, {}).get('frequency', 1)

    print(len(pattern_types))
    with open_file(outfile, 'w') as o:
        json.dump(pattern_types, o)
예제 #2
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def get_vocabulary_probs(ctx, vocabulary, outfile, add_smoothing):
    """Turn per-level vocabulary counts into additively smoothed probabilities.

    ``vocabulary`` may be an inline JSON string or the name of a JSON file
    mapping level -> {entry: count}.  For each level the output holds a pair
    ``(probs, unseen_prob)``: the smoothed probability of every known entry,
    and the probability mass assigned to a single unseen entry.
    """

    # accept either an inline JSON string or a file name
    try:
        vocabularies = json.loads(vocabulary)
    except json.JSONDecodeError:
        with open_file(vocabulary) as dict_file:
            vocabularies = json.load(dict_file)

    vocabularies_probs = {}

    for level in vocabularies:

        counts = vocabularies[level]
        entries = len(counts)
        # total token count of this level (sum() replaces the manual loop)
        freq = sum(counts.values())

        # Lidstone smoothing: (c + a) / (N + a * V); hoist the denominator
        denominator = freq + add_smoothing * entries
        probs = {
            entry: (count + add_smoothing) / denominator
            for entry, count in counts.items()
        }

        # second element: smoothed probability of an unseen entry
        vocabularies_probs[level] = (probs, add_smoothing / denominator)

    with open_file(outfile, 'w') as o:
        json.dump(vocabularies_probs, o)
예제 #3
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def convert_pattern_list(ctx, infile, outfile, remove_hapax):
    """Merge a sorted, tab-separated pattern/content listing into JSON lines.

    Consecutive input lines sharing the same pattern are merged into one
    record ``[pattern, [content, ...]]``.  With ``remove_hapax`` set,
    patterns that occur only once are dropped.  Assumes the input is grouped
    by pattern (e.g. sorted).
    """
    def write_pattern(pattern, contents, outfile):
        json.dump((pattern, contents), outfile)
        outfile.write("\n")

    with open_file(infile) as infile:
        with open_file(outfile, 'w') as outfile:

            current_pattern = None
            contents = []

            for line in infile:
                pattern, content = line.rstrip().split("\t")

                if pattern != current_pattern:

                    # flush the finished group before starting a new one
                    if current_pattern is not None:
                        if not remove_hapax or len(contents) > 1:
                            write_pattern(current_pattern, contents, outfile)

                    current_pattern = pattern
                    contents = []

                contents.append(content)

            # flush the final group; the None guard prevents emitting a
            # spurious [null, []] record when the input file is empty
            if current_pattern is not None:
                if not remove_hapax or len(contents) > 1:
                    write_pattern(current_pattern, contents, outfile)
예제 #4
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def get_top_n_base_patterns(ctx, patterns_file, base_patterns_file, n, outfile,
                            example_ids):
    """Trim each pattern's base patterns to the ``n`` most frequent ones and
    attach their example sentences; optionally dump all used sentence ids.

    NOTE(review): the base-pattern file is re-scanned once per pattern line —
    presumably a deliberate memory/runtime trade-off; confirm before changing.
    """

    # first pass: frequency (number of sentences) of every base pattern
    with open_file(base_patterns_file) as infile:

        base_patterns = {}

        for line in infile:
            pattern, sentences = json.loads(line)
            base_patterns[pattern] = len(sentences)

    bp_example_ids = set()  # ids of all example sentences that were emitted
    with open_file(patterns_file) as infile:
        with open_file(outfile, 'w') as o:

            for line in infile:

                pattern, content = json.loads(line)
                bp = content['base_patterns']

                # keep only the n most frequent base patterns
                if len(bp) > n:
                    bp = sorted(bp,
                                key=lambda pattern: base_patterns[pattern],
                                reverse=True)[:n]

                # re-scan the base-pattern file to collect the example
                # sentences of the selected base patterns
                bp_set = set(bp)
                bp_with_examples = []
                with open_file(base_patterns_file) as basefile:

                    for baseline in basefile:
                        bpattern, sentences = json.loads(baseline)
                        if bpattern in bp_set:
                            examples = [
                                json.loads(sentence) for sentence in sentences
                            ]
                            # ids were stored 1-based ("I have added 1 to the
                            # id"); shift back to 0-based here
                            for example in examples:
                                example[0] = example[0] - 1
                            bp_with_examples.append((bpattern, examples))
                            bp_example_ids.update(
                                set([sentence[0] for sentence in examples]))

                            ## stop scanning the base-pattern file once all
                            ## selected base patterns have been found
                            bp_set.remove(bpattern)
                            if not bp_set:
                                break

                bp = bp_with_examples

                content['base_patterns'] = bp

                json.dump([pattern, content], o)
                o.write("\n")

        if example_ids is not None:
            with open_file(example_ids, 'w') as o:
                json.dump(list(bp_example_ids), o)
예제 #5
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def decode_pattern_collection(ctx, infile, encoder, outfile, string, unknown):
    """Decode a JSON-lines pattern collection, including its base patterns.

    Patterns are emitted either as plain strings (``string`` flag) or as
    base64-encoded pickles.  When ``unknown`` is given, base patterns that
    contain the unknown element are dropped.
    """

    with open_file(encoder, 'rb') as encoder_file:
        pattern_encoder = Base64Encoder(PatternEncoder.load(encoder_file),
                                        binary=False)

    with open_file(infile) as infile:
        with open_file(outfile, 'w') as o:

            for line in infile:

                pattern, content = json.loads(line)
                decoded_pattern = pattern_encoder.decode(pattern)

                if string:
                    out_pattern = str(decoded_pattern)
                else:
                    out_pattern = base64.b64encode(
                        pickle.dumps(decoded_pattern)).decode('ascii')

                base_patterns = content.get('base_patterns', [])
                decoded_base_patterns = []
                for base_pattern in base_patterns:

                    try:
                        examples = []
                        # a base pattern may come with attached examples
                        if len(base_pattern) == 2:
                            examples = base_pattern[1]
                            base_pattern = base_pattern[0]

                        decoded_pattern = pattern_encoder.decode(base_pattern)

                        if unknown is None or all([
                                element != unknown for element in
                                decoded_pattern.get_element_list()
                        ]):
                            if string:
                                cout_pattern = str(decoded_pattern)
                            else:
                                cout_pattern = base64.b64encode(
                                    pickle.dumps(decoded_pattern)).decode(
                                        'ascii')

                            decoded_base_patterns.append(
                                [cout_pattern, examples])
                    # narrowed from a bare ``except``, which also swallowed
                    # SystemExit and KeyboardInterrupt
                    except Exception:
                        ctx.obj['logger'].warning(
                            "Could not test pattern for unknown, skipping.")

                content['base_patterns'] = decoded_base_patterns

                json.dump((out_pattern, content), o)
                o.write("\n")
예제 #6
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def corpus2sentences(ctx, infile, outdir, example_ids):
    """Split a CoNLL-U corpus into one file per sentence, named by its index.

    When ``example_ids`` (a JSON file containing a list of ids) is given,
    only the listed sentence ids are written.
    """

    try:
        os.mkdir(outdir)
    except OSError:
        # the directory may already exist; report and keep going
        print("Creation of the directory %s failed" % outdir)

    if example_ids is not None:
        # close the id file instead of leaking the handle
        with open_file(example_ids) as ids_file:
            example_ids = set(json.load(ids_file))

    with open_file(infile) as corpusfile:

        for sent_id, sentence in enumerate(conllu.parse_incr(corpusfile)):
            print(sent_id)
            if example_ids is None or sent_id in example_ids:
                # context manager ensures each sentence file is closed
                with open(os.path.join(outdir, str(sent_id)), 'w') as out:
                    print(sentence.serialize(), file=out)
예제 #7
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def decode_patterns(ctx, infile, encoder, outfile, processes):
    """Decode encoded patterns in parallel and pickle (raw, decoded) pairs."""

    with open_file(encoder, 'rb') as encoder_file:
        pattern_encoder = Base64Encoder(PatternEncoder.load(encoder_file),
                                        binary=False)

    # bind the encoder once so only the line varies per call
    decoder = functools.partial(decode_pattern,
                                pattern_encoder=pattern_encoder)

    with open_file(infile) as source, \
            open_file(outfile, 'wb') as sink, \
            MultiprocessMap(processes, chunksize=1000) as parallel_map:

        for raw_pattern, decoded in parallel_map(decoder, source):
            ctx.obj['logger'].info("Pattern")
            pickle.dump((raw_pattern, decoded), sink)
예제 #8
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def filter_patterns(ctx, patterns, stats, feature, threshold, outfile):
    """Keep only pattern lines whose ``feature`` statistic reaches ``threshold``.

    First builds the set of patterns to keep from the stats file, then copies
    the matching lines from the patterns file verbatim.
    """

    keep = set()
    with open_file(stats) as infile:
        for line in infile:
            # renamed loop target: it previously shadowed the ``stats`` parameter
            pattern, pattern_stats = json.loads(line)
            if pattern_stats.get(feature, 0) >= threshold:
                keep.add(pattern)

    with open_file(patterns) as infile:
        with open_file(outfile, 'w') as o:

            for line in infile:

                pattern, _ = json.loads(line)

                # copy the original line verbatim to preserve formatting
                if pattern in keep:
                    o.write(line)
예제 #9
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def get_top_n(ctx, patterns_file, pattern_stats, stat, n, outfile):
    """Write the n patterns ranked highest by ``stat``, with stats and content."""

    # load the per-pattern statistics
    with open_file(pattern_stats) as infile:

        pattern_stats = {}

        for line in infile:
            pattern, stats = json.loads(line)
            pattern_stats[pattern] = stats

    # rank all patterns by the requested statistic, best first
    ranked = sorted(pattern_stats.items(),
                    key=lambda item: item[1][stat],
                    reverse=True)
    # pattern -> [rank]; the pattern's content is appended in the next pass
    patterns = {
        entry[0]: [rank]
        for rank, entry in enumerate(ranked[:n])
    }

    # attach the content of each selected pattern
    with open_file(patterns_file) as infile:

        for line in infile:

            pattern, content = json.loads(line)

            if pattern in patterns:
                patterns[pattern].append(content)

    # emit the selected patterns in rank order
    with open_file(outfile, 'w') as o:
        for pattern, value in sorted(patterns.items(),
                                     key=lambda item: item[1][0]):
            record = [
                pattern, {
                    'stats': pattern_stats[pattern],
                    'base_patterns': value[1]
                }
            ]
            json.dump(record, o)
            o.write("\n")
예제 #10
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def add_pattern_stats(ctx, infile_patterns, outfile, known_stats,
                      base_patterns, decoded_patterns, config,
                      vocabulary_probs, pattern_profile_frequency):
    """Compute statistics for every pattern and write them as JSON lines.

    Each optional argument names an auxiliary input file; when given, it is
    loaded in place (the file-name argument is rebound to the loaded data)
    and handed to ``get_stats``; otherwise ``None`` is passed through.
    """

    # word level from the extraction config, if a config file is provided
    base_level = (open_json_config(config)['word_level']
                  if config is not None else None)

    if decoded_patterns is not None:
        # rebind to a pattern -> decoded-pattern mapping
        with open_file(decoded_patterns, 'rb') as infile:

            decoded_patterns = {}

            # the pickle stream has no length marker; read until EOF
            while True:
                try:
                    pattern, decoded_pattern = pickle.load(infile)
                except EOFError:
                    break
                decoded_patterns[pattern] = decoded_pattern

    if base_patterns is not None:
        # rebind to a pattern -> sentence-count mapping
        with open_file(base_patterns) as infile:

            base_patterns = {}

            for line in infile:
                pattern, sentences = json.loads(line)
                base_patterns[pattern] = len(sentences)

    if pattern_profile_frequency is not None:
        with open_file(pattern_profile_frequency, 'r') as infile:
            pattern_profile_frequency = json.load(infile)

    if vocabulary_probs is not None:
        with open_file(vocabulary_probs, 'r') as infile:
            vocabulary_probs = json.load(infile)

    if known_stats is not None:
        # rebind to a pattern -> stats mapping
        with open_file(known_stats) as infile:

            known_stats = {}

            for line in infile:
                pattern, stats = json.loads(line)
                known_stats[pattern] = stats

    # bind all auxiliary data once; only the pattern line varies per call
    stats_for = functools.partial(
        get_stats,
        decoded_patterns=decoded_patterns,
        known_stats=known_stats,
        base_patterns=base_patterns,
        base_level=base_level,
        pattern_profile_frequency=pattern_profile_frequency,
        vocabulary_probs=vocabulary_probs)

    number = 0
    with open_file(infile_patterns) as infile:
        with open_file(outfile, 'w') as o:

            for pattern, stats in map(stats_for, infile):

                number += 1
                ctx.obj['logger'].info("Pattern " + str(number))

                json.dump((pattern, stats), o)
                o.write("\n")
예제 #11
0
파일: cli.py 프로젝트: fab-bar/cxnMiner
def extract_patterns(ctx, infile, outfile_patterns, outfile_base,
                     encoded_dictionaries, config, keep_only_word,
                     keep_only_dict_words, skip_unknown, only_base):
    """Extract patterns from an encoded CoNLL-U corpus.

    Builds an extractor from ``config`` and the encoded dictionaries, then
    streams the corpus, writing non-base patterns (tab-separated) to
    ``outfile_patterns`` and base patterns (pattern TAB json) to
    ``outfile_base``.
    """

    config = open_json_config(config)
    word_level = config["word_level"]
    phrase_tags = config["phrase_tags"]
    unknown_type = config["unknown"]

    # accept either an inline JSON string or a file name
    try:
        encoded_dict = json.loads(encoded_dictionaries)
    except json.JSONDecodeError:
        with open_file(encoded_dictionaries) as dict_file:
            encoded_dict = json.load(dict_file)

    # special meta symbols; fall back to the literal names when unmapped
    meta_symbols = encoded_dict.get("__special__", {})
    token_start = meta_symbols.get("__TOKEN_START__", "__TOKEN_START__")
    token_end = meta_symbols.get("__TOKEN_END__", "__TOKEN_END__")

    # per-level encoding of the unknown marker
    unknown = {}
    for level in encoded_dict.keys():
        if level != "__special__":
            unknown[level] = encoded_dict[level].get(unknown_type,
                                                     unknown_type)

    if keep_only_dict_words:
        # per-level sets of all known (encoded) vocabulary entries
        known = {
            level: set(vocab.values())
            for level, vocab in encoded_dict.items()
            if isinstance(vocab, collections.abc.Mapping)
        }
    else:
        known = None

    if phrase_tags:
        # translate phrase tags into their encoded form where possible
        phrase_tags = [
            encoded_dict.get("upos", {}).get(element, element)
            for element in phrase_tags
        ]
        special_node_conversion = functools.partial(conversion_function,
                                                    tags=phrase_tags)
    else:
        special_node_conversion = None

    if keep_only_word is not None:
        keep_only_word = encoded_dict["lemma"][keep_only_word]

    # the dictionaries can be large; free them before streaming the corpus
    del encoded_dict

    # wire the extractor options with the encoded bracket/comma symbols
    extractor_config = config["extractor"]
    extractor_config['options'][
        'special_node_conversion'] = special_node_conversion
    extractor_config['options']['left_bracket'] = meta_symbols.get(
        extractor_config['options']['left_bracket'])
    extractor_config['options']['right_bracket'] = meta_symbols.get(
        extractor_config['options']['right_bracket'])
    extractor_config['options']['comma'] = meta_symbols.get(
        extractor_config['options']['comma'])

    extractor = factories.create_from_name('extractor', extractor_config)

    with open_file(infile) as infile:
        with open_file(outfile_patterns, 'w') as outfile_patterns:
            with open_file(outfile_base, 'w') as outfile_base:

                for sentence_patterns in map(
                        functools.partial(pattern_extraction,
                                          extractor=extractor,
                                          word_level=word_level,
                                          token_start=token_start,
                                          token_end=token_end,
                                          keep_only_word=keep_only_word,
                                          logger=ctx.obj['logger'],
                                          skip_unknown=skip_unknown,
                                          unknowns=unknown,
                                          known=known),
                        enumerate(conllu.parse_incr(infile))):

                    # route each pattern to the matching output file
                    for is_base_pattern, pattern, content in sentence_patterns:
                        if not is_base_pattern:
                            if not only_base:
                                print("\t".join([pattern,
                                                 str(content)]),
                                      file=outfile_patterns)
                        else:
                            print("\t".join([pattern,
                                             json.dumps(content)]),
                                  file=outfile_base)
예제 #12
0
def test_open_text_file(mockfunction):
    """A plain text file is opened in text mode with UTF-8 encoding."""
    name = 'test.txt'
    open_file(name)
    mockfunction.assert_called_with(name, 'r', encoding='utf-8')
예제 #13
0
def test_open_zipped_binary_file(mockfunction):
    """A gzipped binary file is opened in binary mode without an encoding."""
    name = 'test.bin.gz'
    open_file(name, 'rb')
    mockfunction.assert_called_with(name, 'rb')