Example #1
def remove_duplicates(extract_or_regexify, outpath=None, encoding='utf8', **kwargs):
    _outpath = get_output_path(extract_or_regexify, outpath, exts=('clean',))
    outpath = f'{_outpath}.tsv'
    with open(extract_or_regexify, encoding=encoding) as fh:
        text = fh.read()
    # # both files have same format
    # if 'regexify' in extract_or_regexify:
    #     extract_file = False
    # elif 'extract' in extract_or_regexify:
    #     extract_file = True
    # elif re.search(r'\[W\d+<', text):  # is extract file
    #     extract_file = True
    # elif re.search(r'\\w\+', text):  # is regexify file
    #     extract_file = False
    # else:
    #     raise ValueError('Unrecognized file type: expected extract or regexify')

    existing_terms = {}
    for line in text.split('\n'):
        if not line.strip():  # skip blank lines (e.g., the trailing newline)
            continue
        concept, name, term = line.split('\t')
        if term in existing_terms:
            c, n, t = existing_terms[term]
            logger.warning(f'Found duplicate term "{term}"')
            if concept != c:
                logger.warning(f'Concept differs: {concept} ({name}) vs {c} ({n})')
            if len(name) < len(n):  # keep the shortest/simplest spelling
                existing_terms[term] = (concept, name, term)
        else:
            existing_terms[term] = (concept, name, term)

    with open(outpath, 'w', encoding=encoding) as out:
        for c, n, t in existing_terms.values():
            out.write(f'{c}\t{n}\t{t}\n')
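A minimal, self-contained sketch of the deduplication rule above: one entry is kept per term, and when two rows share a term the shorter name wins. The sample rows are hypothetical.

# Standalone illustration of the dedup rule; sample rows are hypothetical.
rows = [
    ('C01', 'MI', 'mi'),
    ('C01', 'myocardial infarction (heart attack)', 'mi'),  # same term, longer name
    ('C02', 'hypertension', 'htn'),
]
existing_terms = {}
for concept, name, term in rows:
    if term in existing_terms:
        _, n, _ = existing_terms[term]
        if len(name) < len(n):  # keep the shortest/simplest spelling
            existing_terms[term] = (concept, name, term)
    else:
        existing_terms[term] = (concept, name, term)
print(existing_terms['mi'])  # ('C01', 'MI', 'mi')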
Example #2
def regexify_keywords_to_file(extract,
                              outpath=None,
                              encoding='utf8',
                              extra_slop=1,
                              **kwargs):
    _outpath = get_output_path(extract, outpath, exts=('regexify', ))
    outpath = f'{_outpath}.tsv'
    code_pat = re.compile(r'\d+\.\d+')
    num_pat = re.compile(r'\d+')
    slop_pat = re.compile(r'\[W(?P<slop>\d+)<\|(?P<punct>.*?)\|>\]')
    regexes = []
    with open(extract, encoding=encoding) as fh:
        for line in fh:
            concept, keywords, term = line.strip().split('\t')
            words = []
            prev_word = False
            terms = term.split(' ')
            for word in terms:
                if code_pat.match(word):  # numeric code (e.g., 250.00) becomes its own regex
                    code = word.replace('.', r'\.')
                    regexes.append((concept, keywords, fr'\b{code}\b'))
                    prev_word = False
                elif num_pat.match(word):  # bare number
                    if prev_word:
                        words.append(r'\W*')
                    if len(terms) > 1:  # multi-word term: generalize to any number
                        words.append(r'\d+')
                    else:  # single-token term: keep the literal number
                        words.append(word)
                    prev_word = True
                elif slop_pat.match(word):  # word-gap ("slop") marker between keywords
                    if prev_word:
                        m = slop_pat.match(word)
                        cnt = int(m.group('slop')) + extra_slop
                        if '.' in m.group('punct') or ';' in m.group('punct'):
                            words.append(rf'\W*(\w+\W*){{0,{cnt}}}')
                        else:
                            words.append(
                                rf'[^\w\.;]*(\w+[^\w\.;]*){{0,{cnt}}}')
                    prev_word = False
                else:  # is word
                    if prev_word:
                        words.append(r'\W*')
                    if len(word) > 4:  # longer word: delegate to the stemmer
                        words.append(Stemmer.transform(word))
                    else:  # short word: require an exact, bounded match
                        words.append(fr'\b{word}\b')
                    prev_word = True
            regexes.append((concept, keywords, ''.join(words)))

    with open(outpath, 'w', encoding=encoding) as out:
        out.write('\n'.join('\t'.join(line) for line in regexes))
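A quick, runnable look at how a slop marker in an extract term becomes a bounded word gap in the generated regex. The [W2<|, |>] token, the surrounding keywords, and the test strings are hypothetical; extra_slop=1 mirrors the function's default.

import re

# Standalone look at the slop marker handling above; the [W2<|, |>] token,
# the surrounding words, and the test strings are hypothetical.
slop_pat = re.compile(r'\[W(?P<slop>\d+)<\|(?P<punct>.*?)\|>\]')
extra_slop = 1  # mirrors the function's default

m = slop_pat.match('[W2<|, |>]')
cnt = int(m.group('slop')) + extra_slop  # allow up to 3 intervening words
if '.' in m.group('punct') or ';' in m.group('punct'):
    gap = rf'\W*(\w+\W*){{0,{cnt}}}'  # punctuation included a sentence break
else:
    gap = rf'[^\w\.;]*(\w+[^\w\.;]*){{0,{cnt}}}'  # gap may not cross '.' or ';'

pattern = re.compile(rf'\bchest\b{gap}\bpain\b', re.IGNORECASE)
print(bool(pattern.search('chest wall pain')))    # True
print(bool(pattern.search('chest x-ray. pain')))  # False: '.' blocks the gap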
Example #3
def apply_regex_to_corpus(regex,
                          outpath=None,
                          encoding='utf8',
                          run_hours=None,
                          exclude_captured=False,
                          log_incr=10000,
                          newline_replace=' ',
                          **kwargs):
    """

    :param regex:
    :param outpath:
    :param encoding:
    :param run_hours:
    :param exclude_captured:
    :param log_incr: number of records to run before reporting how long this many
        documents took to run
    :param kwargs:
    :return:
    """
    _outpath = get_output_path(regex, outpath, exts=('apply', ))
    start_time = datetime.datetime.now()
    dt = start_time.strftime('%Y%m%d_%H%M%S')
    outpath = f'{_outpath}.{dt}.tsv'
    logger.info(f'Primary output file: {outpath}')
    regexes = compile_regexes(regex, encoding)
    logger.info(f'Compiled {len(regexes)} regexes.')
    rx_cnt = 0
    logger.info('Loading files.')
    with open(outpath, 'w', encoding=encoding) as out:
        out.write('document\tconcept\tterm\tcaptured\n')
        for i, (name, doc) in enumerate(get_documents(**kwargs)):
            for concept, term, regex in regexes:
                for m in regex.finditer(doc):
                    rx_cnt += 1
                    capture = '' if exclude_captured else m.group()
                    capture = capture.replace('\n', newline_replace)
                    out.write(f'{name}\t{concept}\t{term}\t{capture}\n')
            if i % log_incr == 0:
                logger.info(
                    f'Completed {i + 1} documents ({rx_cnt} concepts identified)'
                )
                if check_time_expired(start_time, run_hours):
                    logger.warning('Time expired.')
                    break
    logger.info(
        f'Process completed: {i + 1} documents in {datetime.datetime.now() - start_time}'
    )
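A self-contained sketch of the inner matching loop: every compiled regex runs over every document, and each hit becomes one row of the output tsv. The documents and regexes below are hypothetical stand-ins for what get_documents() and compile_regexes() would provide.

import re

# Self-contained sketch of the inner matching loop; documents and regexes
# below are hypothetical stand-ins for get_documents() and compile_regexes().
documents = {'note_001': 'Patient denies chest pain.\nHistory of MI.'}
regexes = [
    ('CARDIAC', 'chest pain', re.compile(r'\bchest\W*pain\b', re.IGNORECASE)),
    ('CARDIAC', 'mi', re.compile(r'\bMI\b')),
]

newline_replace = ' '
rows = ['document\tconcept\tterm\tcaptured']
for name, doc in documents.items():
    for concept, term, regex in regexes:
        for m in regex.finditer(doc):
            captured = m.group().replace('\n', newline_replace)
            rows.append(f'{name}\t{concept}\t{term}\t{captured}')
print('\n'.join(rows))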
Example #4
def merge_extracts(*extracts,
                   outpath=None,
                   encoding='utf8',
                   ignore_duplicates=True,
                   **kwargs):
    if not extracts:
        extracts = kwargs['extracts']
    _outpath = get_output_path(extracts[0],
                               outpath,
                               exts=('extract.combined', ))
    outpath = f'{_outpath}.tsv'
    keyword_to_concept = {}
    concept_to_term = defaultdict(lambda: defaultdict(str))
    existing_terms = set()
    for i, extract in enumerate(extracts):
        name = os.path.basename(extract)
        with open(extract, encoding=encoding) as fh:
            for line in fh:
                concept, keywords, term = line.strip().split('\t')
                if keywords in keyword_to_concept:
                    if keyword_to_concept[keywords] != concept:
                        logger.warning(
                            f'Ignoring disagreement: "{name}" (extract #{i + 1}) classifies'
                            f' "{keywords}" in "{concept}", expected: "{keyword_to_concept[keywords]}"'
                        )
                else:
                    keyword_to_concept[keywords] = concept

                if keywords in concept_to_term[concept]:
                    orig_term = concept_to_term[concept][keywords]
                    term = merge_terms(orig_term, term)
                if not (ignore_duplicates and term in existing_terms):
                    concept_to_term[concept][keywords] = term
                    existing_terms.add(term)

    with open(outpath, 'w', encoding=encoding) as out:
        for concept in concept_to_term:
            for keywordstr, term in concept_to_term[concept].items():
                out.write(f'{concept}\t{keywordstr}\t{term}\n')
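A small, runnable sketch of the conflict rule used during merging: the first concept seen for a keyword wins, and later disagreements are only reported. The rows below are hypothetical.

# Standalone sketch of the conflict rule; the rows below are hypothetical.
rows = [
    ('DIABETES', 'dm type 2', 'dm type 2'),
    ('ENDOCRINE', 'dm type 2', 'diabetes mellitus type 2'),
]

keyword_to_concept = {}
for concept, keywords, term in rows:
    if keywords in keyword_to_concept:
        if keyword_to_concept[keywords] != concept:
            print(f'Ignoring disagreement: "{keywords}" classified as "{concept}",'
                  f' expected "{keyword_to_concept[keywords]}"')
    else:
        keyword_to_concept[keywords] = concept
print(keyword_to_concept)  # {'dm type 2': 'DIABETES'}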
Example #5
def build_frequency_file(bratdb,
                         *,
                         outpath=None,
                         title='Term Frequency',
                         **kwargs):
    outpath = get_output_path(bratdb,
                              outpath,
                              exts=('freq', 'rst' if PYSCRIVEN else 'txt'))
    freqs = get_frequency(bratdb, **kwargs)
    if not PYSCRIVEN:
        return build_simple_freq_file(freqs, outpath)

    rst_list = [('heading', title)]
    for label, datum in tabulate_dict_counter(freqs,
                                              fillvalue='-',
                                              as_items=True):
        rst_list.append(('heading', label, {'level': 2}))
        rst_list.append(('table',
                         Table(header=('Annotation', 'Term', 'Frequency'),
                               data=datum)))
    with RestWriter(fp=outpath) as out:
        out.write_all(rst_list)
Example #6
def test_valid_output():
    target = r'C:\test\example.pkl'
    outpath = expected = r'D:\test\this.txt'
    actual = get_output_path(target, outpath=outpath)
    assert expected == actual
Example #7
def test_exts(target, expected, exts):
    actual = get_output_path(target, exts=exts)
    assert expected == actual
Example #8
def test_default(target, expected):
    actual = get_output_path(target)
    assert expected == actual
Example #9
def build_bratdb_info_file(bratdb, *, outpath=None, **kwargs):
    outpath = get_output_path(bratdb, outpath, exts=('info', 'txt'))
    data = get_brat_info(bratdb, **kwargs)
    with open(outpath, 'w') as out:
        for key, value in data:
            out.write(f'{key:.<30}.{value}\n')
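For reference, the format spec {key:.<30} left-aligns the key and pads it with dots out to 30 characters, and the literal '.' after it guarantees at least one dot before the value. A quick check with a hypothetical key/value pair:

# The format spec '.<30' left-aligns key and pads with '.' to width 30;
# the extra literal '.' guarantees at least one dot before the value.
key, value = 'annotation_count', 1234  # hypothetical entry, not from get_brat_info
print(f'{key:.<30}.{value}')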
Example #10
def extract_keywords_to_file(bratdb,
                             *,
                             outpath=None,
                             sep='\t',
                             one_label_per_term=True,
                             encoding='utf8',
                             **kwargs):
    _outpath = get_output_path(bratdb, outpath, exts=('extract', ))
    outpath = f'{_outpath}.tsv'
    freq_path = f'{_outpath}.freq.tsv'
    info_path = f'{_outpath}.info'
    dupe_path = f'{_outpath}.dupes'
    hapax_add_path = f'{_outpath}.add.hapax'
    hapax_omit_path = f'{_outpath}.omit.hapax'
    data, dupe_dict = get_keywords(bratdb, **kwargs)

    keyword_to_concept = {}  # store only the most frequent concept label for each keyword
    with open(dupe_path, 'w', encoding=encoding) as out:
        out.write('keyword\tconcepts\n')
        for keyword, concepts in dupe_dict.items():
            mc = Counter(concepts).most_common()
            concepts = (f'{k} ({v})' for k, v in mc)
            out.write(f'{keyword}\t{", ".join(concepts)}\n')
            if one_label_per_term:
                keyword_to_concept[keyword] = mc[0][0]

    terms = defaultdict(set)
    hapax_added = set()
    hapax_ignored = set()
    with open(freq_path, 'w', encoding=encoding) as out:
        out.write('concept\tterm\tfreq\n')
        for concept, keywordstr, freq in data.term_frequencies:
            # only keep majority term
            if not keyword_to_concept or keyword_to_concept.get(
                    keywordstr, concept) == concept:
                out.write(f'{concept}{sep}{keywordstr}{sep}{freq}\n')

                if freq == 1:  # handle hapax legomena
                    if data.get_freq(keywordstr) > 1:  # otherwise exists
                        terms[concept].add(keywordstr)
                    else:  # only retain known keywords
                        new_keyword = [
                            kw for kw in data.get_term_keywords(keywordstr)
                            if data.get_keyword_freq(kw) >= 2
                        ]
                        new_keyword_str = ' '.join(str(w) for w in new_keyword)
                        if data.get_freq(new_keyword_str) <= 1 and len(
                                new_keyword) > 1:
                            term = data.get_term(keywordstr)
                            new_stopwords = {
                                str(w)
                                for w in data.get_term_keywords(keywordstr)
                                if data.get_keyword_freq(w) < 2
                            }
                            new_term = Term(term._orig_term,
                                            add_stopwords=new_stopwords)
                            terms[concept].add(new_keyword_str)
                            data.add(new_term, concept)
                            data.update()
                            hapax_added.add(new_keyword_str)
                        else:
                            hapax_ignored.add(keywordstr)
                else:
                    terms[concept].add(keywordstr)

    with open(hapax_add_path, 'w', encoding=encoding) as out:
        out.write('\n'.join(hapax_added))
    with open(hapax_omit_path, 'w', encoding=encoding) as out:
        out.write('\n'.join(hapax_ignored))
    with open(outpath, 'w', encoding=encoding) as out:
        for concept in terms:
            for keywordstr in terms[concept]:
                out.write(
                    f'{concept}\t{keywordstr}\t{data.get_term(keywordstr).segmentstr}\n'
                )
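A minimal sketch of the duplicate-concept resolution written to the .dupes file: each keyword keeps only its most frequent concept label. The dupe_dict below is hypothetical.

from collections import Counter

# Each keyword keeps only its most frequent concept label; dupe_dict is hypothetical.
dupe_dict = {'cp': ['CARDIAC', 'CARDIAC', 'PULMONARY']}

keyword_to_concept = {}
for keyword, concepts in dupe_dict.items():
    mc = Counter(concepts).most_common()
    labels = ', '.join(f'{k} ({v})' for k, v in mc)
    print(f'{keyword}\t{labels}')  # cp    CARDIAC (2), PULMONARY (1)
    keyword_to_concept[keyword] = mc[0][0]
print(keyword_to_concept)  # {'cp': 'CARDIAC'}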