def remove_duplicates(extract_or_regexify, outpath=None, encoding='utf8', **kwargs):
    _outpath = get_output_path(extract_or_regexify, outpath, exts=('clean',))
    outpath = f'{_outpath}.tsv'
    with open(extract_or_regexify, encoding=encoding) as fh:
        text = fh.read()
    # # both files have same format
    # if 'regexify' in extract_or_regexify:
    #     extract_file = False
    # elif 'extract' in extract_or_regexify:
    #     extract_file = True
    # elif re.search(r'\[W\d+<', text):  # is extract file
    #     extract_file = True
    # elif re.search(r'\\w\+', text):  # is regexify file
    #     extract_file = False
    # else:
    #     raise ValueError('Unrecognized file type: expected extract or regexify')
    existing_terms = {}
    for line in text.split('\n'):
        if not line.strip():  # skip blank lines (e.g., trailing newline)
            continue
        concept, name, term = line.split('\t')
        if term in existing_terms:
            c, n, t = existing_terms[term]
            logger.warning(f'Found duplicate term "{term}"')
            if concept != c:
                logger.warning(f'Concept differs: {concept} ({name}) vs {c} ({n})')
            if len(name) < len(n):  # keep the shortest/simplest spelling
                existing_terms[term] = (concept, name, term)
        else:
            existing_terms[term] = (concept, name, term)
    with open(outpath, 'w', encoding=encoding) as out:
        for c, n, t in existing_terms.values():
            out.write(f'{c}\t{n}\t{t}\n')
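
# Usage sketch (illustrative, not part of the original module): deduplicate an
# extract or regexify TSV with concept<TAB>name<TAB>term rows. The input path
# is hypothetical; the '.clean.tsv' output path is derived by the function.
def _example_remove_duplicates():
    remove_duplicates('annotations.extract.tsv')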
def regexify_keywords_to_file(extract, outpath=None, encoding='utf8', extra_slop=1, **kwargs):
    _outpath = get_output_path(extract, outpath, exts=('regexify',))
    outpath = f'{_outpath}.tsv'
    code_pat = re.compile(r'\d+\.\d+')
    num_pat = re.compile(r'\d+')
    slop_pat = re.compile(r'\[W(?P<slop>\d+)<\|(?P<punct>.*?)\|>\]')
    regexes = []
    with open(extract, encoding=encoding) as fh:
        for line in fh:
            concept, keywords, term = line.strip().split('\t')
            words = []
            prev_word = False
            terms = term.split(' ')
            for word in terms:
                if code_pat.match(word):
                    code = word.replace('.', r'\.')
                    regexes.append((concept, keywords, fr'\b{code}\b'))
                    prev_word = False
                elif num_pat.match(word):
                    if prev_word:
                        words.append(r'\W*')
                    if len(terms) > 1:
                        words.append(r'\d+')
                    else:
                        words.append(word)
                    prev_word = True
                elif slop_pat.match(word):
                    if prev_word:
                        m = slop_pat.match(word)
                        cnt = int(m.group('slop')) + extra_slop
                        if '.' in m.group('punct') or ';' in m.group('punct'):
                            words.append(rf'\W*(\w+\W*){{0,{cnt}}}')
                        else:
                            words.append(rf'[^\w\.;]*(\w+[^\w\.;]*){{0,{cnt}}}')
                    prev_word = False
                else:  # is word
                    if prev_word:
                        words.append(r'\W*')
                    if len(word) > 4:
                        words.append(Stemmer.transform(word))
                    else:
                        words.append(fr'\b{word}\b')
                    prev_word = True
            if words:  # avoid emitting an empty pattern when the term was only a code
                regexes.append((concept, keywords, ''.join(words)))
    with open(outpath, 'w', encoding=encoding) as out:
        out.write('\n'.join('\t'.join(line) for line in regexes))
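
# Usage sketch (illustrative): convert an extract TSV into regex patterns,
# allowing one extra word of slop between keywords. The file name is
# hypothetical; the '.regexify.tsv' output path is derived by the function.
def _example_regexify():
    regexify_keywords_to_file('annotations.extract.tsv', extra_slop=2)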
def apply_regex_to_corpus(regex, outpath=None, encoding='utf8', run_hours=None,
                          exclude_captured=False, log_incr=10000,
                          newline_replace=' ', **kwargs):
    """
    :param regex: path to the regexify file (concept, term, regex per line)
    :param outpath: destination path for the apply output
    :param encoding: encoding used for input and output files
    :param run_hours: stop processing after this many hours (None runs to completion)
    :param exclude_captured: if True, do not write the captured text
    :param log_incr: number of documents to process between progress log messages
    :param newline_replace: replacement for newlines inside captured text
    :param kwargs: passed through to get_documents
    :return:
    """
    _outpath = get_output_path(regex, outpath, exts=('apply',))
    start_time = datetime.datetime.now()
    dt = start_time.strftime('%Y%m%d_%H%M%S')
    outpath = f'{_outpath}.{dt}.tsv'
    logger.info(f'Primary output file: {outpath}')
    regexes = compile_regexes(regex, encoding)
    logger.info(f'Compiled {len(regexes)} regexes.')
    rx_cnt = 0
    logger.info('Loading files.')
    with open(outpath, 'w', encoding=encoding) as out:
        out.write('document\tconcept\tterm\tcaptured\n')
        for i, (name, doc) in enumerate(get_documents(**kwargs)):
            for concept, term, rx in regexes:
                for m in rx.finditer(doc):
                    rx_cnt += 1
                    capture = '' if exclude_captured else m.group()
                    capture = capture.replace('\n', newline_replace)
                    out.write(f'{name}\t{concept}\t{term}\t{capture}\n')
            if i % log_incr == 0:
                logger.info(f'Completed {i + 1} documents ({rx_cnt} concepts identified)')
            if check_time_expired(start_time, run_hours):
                logger.warning('Time expired.')
                break  # stop processing once the time budget is exhausted
    logger.info(f'Process completed: {i + 1} documents in {datetime.datetime.now() - start_time}')
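
# Usage sketch (illustrative): run the compiled patterns over a corpus. Any
# keyword arguments are forwarded to get_documents, so corpus configuration
# goes in **kwargs; the path and values shown here are hypothetical.
def _example_apply():
    apply_regex_to_corpus(
        'annotations.extract.regexify.tsv',
        run_hours=8,            # stop early on a large corpus
        exclude_captured=False,
    )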
def merge_extracts(*extracts, outpath=None, encoding='utf8', ignore_duplicates=True, **kwargs):
    if not extracts:
        extracts = kwargs['extracts']
    _outpath = get_output_path(extracts[0], outpath, exts=('extract.combined',))
    outpath = f'{_outpath}.tsv'
    keyword_to_concept = {}
    concept_to_term = defaultdict(lambda: defaultdict(str))
    existing_terms = set()
    for i, extract in enumerate(extracts):
        name = os.path.basename(extract)
        with open(extract, encoding=encoding) as fh:
            for line in fh:
                concept, keywords, term = line.strip().split('\t')
                if keywords in keyword_to_concept:
                    if keyword_to_concept[keywords] != concept:
                        logger.warning(
                            f'Ignoring disagreement: "{name}" (extract #{i + 1}) classifies'
                            f' "{keywords}" in "{concept}", expected: "{keyword_to_concept[keywords]}"'
                        )
                else:
                    keyword_to_concept[keywords] = concept
                if keywords in concept_to_term[concept]:
                    orig_term = concept_to_term[concept][keywords]
                    term = merge_terms(orig_term, term)
                if not (ignore_duplicates and term in existing_terms):
                    concept_to_term[concept][keywords] = term
                    existing_terms.add(term)
    with open(outpath, 'w', encoding=encoding) as out:
        for concept in concept_to_term:
            for keywordstr, term in concept_to_term[concept].items():
                out.write(f'{concept}\t{keywordstr}\t{term}\n')
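
# Usage sketch (illustrative): combine several extract files into one
# '.extract.combined.tsv'; the paths are hypothetical. Disagreements about a
# keyword's concept are logged as warnings rather than raising.
def _example_merge():
    merge_extracts('site_a.extract.tsv', 'site_b.extract.tsv',
                   ignore_duplicates=True)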
def build_frequency_file(bratdb, *, outpath=None, title='Term Frequency', **kwargs):
    outpath = get_output_path(bratdb, outpath, exts=('freq', 'rst' if PYSCRIVEN else 'txt'))
    freqs = get_frequency(bratdb, **kwargs)
    if not PYSCRIVEN:
        return build_simple_freq_file(freqs, outpath)
    rst_list = [('heading', title)]
    for label, datum in tabulate_dict_counter(freqs, fillvalue='-', as_items=True):
        rst_list.append(('heading', label, {'level': 2}))
        rst_list.append(('table', Table(header=('Annotation', 'Term', 'Frequency'), data=datum)))
    with RestWriter(fp=outpath) as out:
        out.write_all(rst_list)
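
# Usage sketch (illustrative): write a term-frequency report for a brat
# database (path hypothetical). When PYSCRIVEN is truthy the report is
# written as reStructuredText via RestWriter; otherwise build_simple_freq_file
# produces a plain-text version.
def _example_frequency_report():
    build_frequency_file('annotations.bratdb', title='Term Frequency')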
def test_valid_output():
    target = r'C:\test\example.pkl'
    outpath = expected = r'D:\test\this.txt'
    actual = get_output_path(target, outpath=outpath)
    assert expected == actual
def test_exts(target, expected, exts):
    actual = get_output_path(target, exts=exts)
    assert expected == actual
def test_default(target, expected):
    actual = get_output_path(target)
    assert expected == actual
def build_bratdb_info_file(bratdb, *, outpath=None, **kwargs):
    outpath = get_output_path(bratdb, outpath, exts=('info', 'txt'))
    data = get_brat_info(bratdb, **kwargs)
    with open(outpath, 'w') as out:
        for key, value in data:
            out.write(f'{key:.<30}.{value}\n')
def extract_keywords_to_file(bratdb, *, outpath=None, sep='\t', one_label_per_term=True,
                             encoding='utf8', **kwargs):
    _outpath = get_output_path(bratdb, outpath, exts=('extract',))
    outpath = f'{_outpath}.tsv'
    freq_path = f'{_outpath}.freq.tsv'
    info_path = f'{_outpath}.info'
    dupe_path = f'{_outpath}.dupes'
    hapax_add_path = f'{_outpath}.add.hapax'
    hapax_omit_path = f'{_outpath}.omit.hapax'
    data, dupe_dict = get_keywords(bratdb, **kwargs)
    keyword_to_concept = {}  # store only most frequent label with each concept
    with open(dupe_path, 'w', encoding=encoding) as out:
        out.write('keyword\tconcepts\n')
        for keyword, concepts in dupe_dict.items():
            mc = Counter(concepts).most_common()
            concepts = (f'{k} ({v})' for k, v in mc)
            out.write(f'{keyword}\t{", ".join(concepts)}\n')
            if one_label_per_term:
                keyword_to_concept[keyword] = mc[0][0]
    terms = defaultdict(set)
    hapax_added = set()
    hapax_ignored = set()
    with open(freq_path, 'w', encoding=encoding) as out:
        out.write('concept\tterm\tfreq\n')
        for concept, keywordstr, freq in data.term_frequencies:
            # only keep majority term
            if not keyword_to_concept or keyword_to_concept.get(keywordstr, concept) == concept:
                out.write(f'{concept}{sep}{keywordstr}{sep}{freq}\n')
                if freq == 1:  # handle hapax legomena
                    if data.get_freq(keywordstr) > 1:  # otherwise exists
                        terms[concept].add(keywordstr)
                    else:
                        # only retain known keywords
                        new_keyword = [
                            kw for kw in data.get_term_keywords(keywordstr)
                            if data.get_keyword_freq(kw) >= 2
                        ]
                        new_keyword_str = ' '.join(str(w) for w in new_keyword)
                        if data.get_freq(new_keyword_str) <= 1 and len(new_keyword) > 1:
                            term = data.get_term(keywordstr)
                            new_stopwords = {
                                str(w) for w in data.get_term_keywords(keywordstr)
                                if data.get_keyword_freq(w) < 2
                            }
                            new_term = Term(term._orig_term, add_stopwords=new_stopwords)
                            terms[concept].add(new_keyword_str)
                            data.add(new_term, concept)
                            data.update()
                            hapax_added.add(new_keyword_str)
                        else:
                            hapax_ignored.add(keywordstr)
                else:
                    terms[concept].add(keywordstr)
    with open(hapax_add_path, 'w', encoding=encoding) as out:
        out.write('\n'.join(hapax_added))
    with open(hapax_omit_path, 'w', encoding=encoding) as out:
        out.write('\n'.join(hapax_ignored))
    with open(outpath, 'w', encoding=encoding) as out:
        for concept in terms:
            for keywordstr in terms[concept]:
                out.write(f'{concept}\t{keywordstr}\t{data.get_term(keywordstr).segmentstr}\n')
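
# Usage sketch (illustrative): extract keywords from a brat database and write
# the main '.extract.tsv' along with the frequency, duplicate, and hapax side
# files; the input path is hypothetical.
def _example_extract():
    extract_keywords_to_file('annotations.bratdb', one_label_per_term=True)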