def fill_gaps(**kwargs):
    """
    Try to find ngrams for missing lemmas by scraping data from
    the Google Ngram Viewer site
    """
    letters = kwargs.get('letters', string.ascii_lowercase)

    # Load list of gaps from file
    infile = os.path.join(GBN_DIR, '4', 'tmp', filename(GBN_DIR))
    with open(infile, 'r') as filehandle:
        gaps = [l.strip() for l in filehandle.readlines()]
    gaps = [g for g in gaps if lexical_sort(g) and lexical_sort(g)[0] in letters]

    results = {letter: [] for letter in letters}
    gba = GoogleBooksApi(start=1750, end=2008)

    # We cluster ngrams into sets of five, which will be dealt with in
    # a single request - cutting down the number of requests
    clusters = _cluster(gaps, 5)
    for ngram_set in clusters:
        print(ngram_set[0])
        for result in gba.get_ngram_data(queries=ngram_set):
            results[result.initial()].append(result)
            #print(result.tostring())
        sleep(SLEEPTIME)

    for letter in results:
        subdir = os.path.join(GBN_DIR, '4', letter)
        if not os.path.exists(subdir):
            os.mkdir(subdir)
        with open(os.path.join(subdir, filename(GBN_DIR)), 'w') as filehandle:
            for r in results[letter]:
                filehandle.write(r.tostring() + '\n')
def matches_headword(self, lemma, exact=False):
    """
    Return True if the lemma matches the entry headword (which would
    indicate that this is a regular sense and not a subentry)
    """
    if exact and lemma == self.headword:
        return True
    elif not exact and lexical_sort(lemma) == lexical_sort(self.headword):
        return True
    else:
        return False
def _compound_reverse_match(lemma, tokens, ngrams):
    match = None
    words = lemma.split()
    if len(words) == 2:
        reverse1 = words[1] + " " + words[0]
        reverse2 = words[1] + "s " + words[0]
        reverse1 = stringtools.lexical_sort(reverse1)
        reverse2 = stringtools.lexical_sort(reverse2)
        for token_full, token_flat in tokens + ngrams:
            if token_flat == reverse1 or token_flat == reverse2:
                match = token_full
                break
    return match
def parse_link_file(self):

    def parse_hw(node):
        text = etree.tostring(node, method='text', encoding='unicode')
        text = text.split('|')[0]
        return text.split(',')[0].strip()

    # Create mappings from OED to ODE
    multilinks = defaultdict(list)
    for filepath in self.link_files:
        tree = etree.parse(filepath)
        for entry in tree.findall('./e'):
            lexid = entry.get('lexid')
            linknode = entry.find('./linkSet/link')
            if linknode is not None:
                oed_id = linknode.get('refentry')
                sub_id = linknode.get('refid')
                if sub_id is not None and sub_id != '0':
                    oed_id = oed_id + '#' + sub_id
                oed_hw = parse_hw(linknode)
                ode_hw = parse_hw(entry.find('label'))
                multilinks[oed_id].append(CandidateLink(lexid, oed_hw, ode_hw))

    for oed_id, linklist in multilinks.items():
        # If there's only one possible ODO link for this OED ID, we accept that.
        # But if there's more than one competing link, we look for the one where
        # the headwords match; or failing that, the one where the headwords
        # fuzzily match.
        if len(linklist) == 1:
            winner = linklist[0]
        else:
            # Exact match
            z = [l for l in linklist if l.oed_headword == l.odo_headword]
            try:
                winner = z[0]
            except IndexError:
                # Fuzzy match
                z = [l for l in linklist
                     if lexical_sort(l.oed_headword) == lexical_sort(l.odo_headword)]
                try:
                    winner = z[0]
                except IndexError:
                    # Give up
                    winner = linklist[0]
        self.links[oed_id] = winner.lexid

    # Create the inverse mapping (from ODE to OED)
    for oed_id, lexid in self.links.items():
        self.links_reversed[lexid] = oed_id

    self.parse_oed_file()
def _tokenize_text(text):
    naive_tokens = text.split()
    tokens = [t.strip(',:;()[]."?! ') for t in naive_tokens]
    tokens = [re.sub(r"'s$", "", t) for t in tokens]
    tokens = [(t, stringtools.lexical_sort(t)) for t in tokens]
    ngrams = (_compile_ngrams(naive_tokens, 2) +
              _compile_ngrams(naive_tokens, 3) +
              _compile_ngrams(naive_tokens, 4))
    return tokens, ngrams
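# A minimal, self-contained sketch of the (surface, flattened) token-pair
# convention produced by _tokenize_text() and consumed by the *_match()
# helpers in this listing. The real stringtools.lexical_sort() is
# project-internal; the stand-in below (lowercase, alphanumeric-only) is
# only an assumption for illustration.
import re

def _lexical_sort_standin(text):
    # Hypothetical approximation of stringtools.lexical_sort()
    return re.sub(r'[^a-z0-9]', '', text.lower())

def _tokenize_standin(text):
    tokens = [t.strip(',:;()[]."?! ') for t in text.split()]
    return [(t, _lexical_sort_standin(t)) for t in tokens]

# 'Whip-cord,' and 'whipcord' both flatten to 'whipcord', so case- and
# punctuation-insensitive matching becomes a plain string comparison on
# the second member of each pair.
print(_tokenize_standin("A length of Whip-cord, tightly twisted."))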
def _parse_line(line, gram_count):
    line = line.strip()
    parts = line.split("\t")

    decades = {}
    while parts and DECADE_PATTERN.search(parts[-1]):
        p = parts.pop()
        decade, score = p.split(":")
        decades[int(decade)] = int(score)

    if len(parts) == 3:
        sortcode = parts[0]
        source_lemma = parts[1]
        wordclass = parts[2]
    elif len(parts) == 1:
        sortcode = None
        source_lemma = parts[0]
        wordclass = "ALL"
    elif len(parts) == 2 and gram_count != 3:
        sortcode = None
        source_lemma = parts[0]
        wordclass = parts[1]
    elif len(parts) == 2:
        sortcode = parts[0]
        source_lemma = parts[1]
        wordclass = "ALL"

    if gram_count >= 3:
        source_lemma = source_lemma.replace(" - ", "-")
    if not sortcode:
        sortcode = lexical_sort(source_lemma)

    return [line, source_lemma, source_lemma, sortcode, decades,
            gram_count, wordclass, None]
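# Hypothetical illustration of the tab-separated line shape that
# _parse_line() appears to handle: an optional sortcode, the lemma, an
# optional wordclass, then a run of "decade:score" frequency fields
# (the ones matched by DECADE_PATTERN). The sample line and field values
# below are invented for the sketch, not taken from the real data files.
sample = "whipcord\twhip-cord\tNN\t1750:3\t1760:7\t1770:12"
parts = sample.split("\t")
decades = dict(p.split(":") for p in parts[3:])
print(parts[:3], decades)
# -> ['whipcord', 'whip-cord', 'NN'] {'1750': '3', '1760': '7', '1770': '12'}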
def collect_sample(self, name, size, function):
    # Count the total number of valid senses across all directories
    total = 0
    for parent_dir in self.directories:
        dir = os.path.join(parent_dir, 'classified')
        for letter in letters:
            pl = PickleLoader(dir, letters=letter)
            for sense in pl.iterate():
                if is_valid(sense, name, function):
                    total += 1

    # Pick a random set of indexes within the valid range
    # (randrange(total) rather than randint(0, total), so that the
    # index can never point past the last valid sense)
    sense_index = set()
    while len(sense_index) < size:
        i = random.randrange(total)
        if i not in sense_index:
            sense_index.add(i)

    # Second pass: collect the senses at the selected indexes
    self.sample = []
    count = 0
    for parent_dir in self.directories:
        dir = os.path.join(parent_dir, 'classified')
        for letter in letters:
            pl = PickleLoader(dir, letters=letter)
            for sense in pl.iterate():
                if is_valid(sense, name, function):
                    if count in sense_index:
                        self.sample.append(sense)
                    count += 1

    self.sample.sort(key=lambda s: lexical_sort(s.lemma))
def index_proper_names(self):
    allnames = set()
    for name_type in ('firstname', 'surname', 'placename'):
        for name in propernames.names_list(name_type):
            if ' ' in name:
                continue
            allnames.add(name)

    for letter in string.ascii_lowercase:
        print('Indexing proper names in %s...' % letter)
        for entry in entry_iterator(letters=letter):
            if entry.primary_wordclass() not in ('NP', 'NPS'):
                continue
            for typeunit in entry.types():
                if (' ' in typeunit.form or
                        typeunit.lemma_manager().capitalization_type() != 'capitalized'):
                    continue
                allnames.add(typeunit.form)

    out_file = os.path.join(FORM_INDEX_DIR, 'proper_names', 'all.txt')
    with open(out_file, 'w') as filehandle:
        for name in allnames:
            sortable = stringtools.lexical_sort(name)
            if (not sortable or
                    len(sortable) > MAX_WORDLENGTH or
                    len(name) > MAX_WORDLENGTH):
                continue
            filehandle.write('%s\t%s\t%s\n' % (sortable, name,
                                               str(propernames.is_common(name))))
def _load_cache(self):
    for letter in LETTERS:
        fname = os.path.join(self.dir, letter + ".xml")
        doc = etree.parse(fname, PARSER)
        for entry in doc.findall("e"):
            blocks = _parse_entry(entry, self.with_definitions, self.max_senses)
            for block in blocks:
                address = (block.entry_id, block.block_id)
                MainSensesCache.blocks[address] = block

    # Index all the blocks by entry ID
    for block in MainSensesCache.blocks.values():
        try:
            MainSensesCache.entries[block.entry_id]
        except KeyError:
            MainSensesCache.entries[block.entry_id] = []
        MainSensesCache.entries[block.entry_id].append(block)

    # Identify minor homographs
    homographs = defaultdict(list)
    for block in MainSensesCache.blocks.values():
        address = (lexical_sort(block.headword), block.wordclass)
        homographs[address].append(block)
    for homograph_set in homographs.values():
        if len(homograph_set) > 1:
            homograph_set.sort(key=lambda b: b.quotations, reverse=True)
            for h in homograph_set[1:]:
                MainSensesCache.minor_homographs[h.entry_id].add(h.block_id)
                MainSensesCache.minor_homographs[h.entry_id].add(h.wordclass)
def _store_forms(block, entry, block_type, letter):
    us_variant = entry.us_variant()

    standardtypes = set()
    varianttypes = set()
    alientypes = set()
    for morphset in block.morphsets():
        if (morphset.form in (entry.lemma, us_variant, block.lemma) or
                morphset.is_oed_headword()):
            _add_types(morphset, standardtypes, letter)
        elif (block_type == 'entry' and
                morphset.date().end > VARIANT_MINIMUM_END_DATE and
                not morphset.is_nonstandard()):
            # Don't store variants for subentries; don't store
            # very old or non-standard variants
            _add_types(morphset, varianttypes, letter)
        _add_alien_variants(morphset, alientypes, letter)
    varianttypes = varianttypes - standardtypes
    alientypes = alientypes - standardtypes

    refentry, refid = block.link(target='oed', asTuple=True)

    if block.has_frequency_table():
        f2000 = block.frequency_table().frequency(period='1990-2007')
        f1950 = block.frequency_table().frequency(period='1940-1959')
        f1900 = block.frequency_table().frequency(period='1890-1909')
        f1850 = block.frequency_table().frequency(period='1840-1859')
        f1800 = block.frequency_table().frequency(period='1790-1809')
        f1750 = block.frequency_table().frequency(period='1750-1769')
    else:
        f2000 = f1950 = f1900 = f1850 = f1800 = f1750 = 0

    definition = block.definition(src='oed') or None

    return BlockData(refentry,
                     refid,
                     block_type,
                     stringtools.lexical_sort(block.lemma),
                     block.lemma,
                     block.wordclass(),
                     definition,
                     block.date().exact('start'),
                     block.date().exact('end'),
                     None,
                     None,
                     standardtypes,
                     varianttypes,
                     alientypes,
                     _round_number(f2000),
                     _round_number(f1950),
                     _round_number(f1900),
                     _round_number(f1850),
                     _round_number(f1800),
                     _round_number(f1750),)
def _find_secondary_lemmas(sense):
    # Any other lemmas (<lm> or <vl> elements) in the sense
    secondary_lemmas = set()
    for tag in ("lm", "vl", "vf"):
        for node in sense.node.findall(".//" + tag):
            text = etree.tounicode(node, method="text", with_tail=False)
            # Skip truncated stuff
            if text.startswith("-") or text.endswith("-"):
                pass
            else:
                secondary_lemmas.add(text)
    secondary_lemmas.discard(sense.lemma)
    return [(l, stringtools.lexical_sort(l)) for l in secondary_lemmas]
def _hyphen_match(inflections, tokens):
    match = None
    for token_full, token_flat in tokens:
        parts = token_full.split("-")
        if len(parts) == 1:
            continue
        parts = [(p, stringtools.lexical_sort(p)) for p in parts]
        for p_full, p_flat in parts:
            if p_flat in inflections:
                match = p_full
        if match:
            break
    return match
def populate_entries():
    vsc = VitalStatisticsCache()
    entries = []
    for entry in vsc.entries:
        row = Entry(id=entry.id,
                    label=entry.label[:LABEL_LENGTH],
                    alphasort=lexical_sort(entry.headword)[:ALPHASORT_LENGTH])
        entries.append(row)
        if len(entries) > 1000:
            Entry.objects.bulk_create(entries)
            entries = []
    Entry.objects.bulk_create(entries)
def ingest_sense(self, sense):
    # Clear the decks
    self.null_sense_level_attributes()

    # If the sense is a main sense defining a compound or similar,
    # we adopt the entry headword as a fallback in case the
    # full compound can't be found (useful for absol. uses, etc.).
    if sense.is_subentry() or sense.is_subentry_like():
        fallback = None
    elif sense.lemma.lower() == self.entry_lemma.lower():
        fallback = None
    elif len(sense.lemma.split()) <= 2:
        fallback = stringtools.lexical_sort(self.entry_lemma)
    else:
        fallback = None

    self.lemma = sense.lemma
    self.lemma_flat = stringtools.lexical_sort(sense.lemma)
    self.secondary_lemmas = _find_secondary_lemmas(sense)
    self.fallback_lemma = fallback
    self.wordclass = sense.primary_wordclass().penn
    self.inflections = _inflection_set(sense.lemma, self.wordclass)

    if sense.is_subentry() or sense.is_subentry_like():
        variants = {}
    elif sense.lemma.lower() == self.entry_lemma.lower():
        variants = {f: d for f, d in self.formslist.items()}
    else:
        variants = {}
    lemma_flat = stringtools.lexical_sort(sense.lemma)
    variants[lemma_flat] = 2050
    # Allow "whippin'" for "whipping"
    if lemma_flat.endswith("ing"):
        variants[lemma_flat.rstrip("g")] = 2050
    self.local_variants = variants
    self.local_variants_inflected = _inflect_variants(variants, self.wordclass)
def find_lemma(self, lemma, **kwargs):
    wordclass = kwargs.get('wordclass')
    locale = kwargs.get('locale')

    candidates = self.find_sortcode(lexical_sort(lemma))
    candidates = [c for c in candidates
                  if (c.lemma == lemma and
                      (wordclass is None or c.wordclass == wordclass))]
    if locale == 'uk':
        candidates = [c for c in candidates if c.variant_type != 'us']
    if locale == 'us':
        candidates = [c for c in candidates if c.variant_type != 'uk']

    # Sort so that the longest and highest-scoring morphsets are at the top
    candidates.sort(key=lambda c: c.score, reverse=True)
    candidates.sort(key=len, reverse=True)
    return candidates
def _compile_ngrams(tokens, length):
    ngrams = []
    for i in range(0, len(tokens)):
        try:
            window = tokens[i:i + length]
        except IndexError:
            pass
        else:
            ngram = ' '.join(window)
            ngram = ngram.strip(',:;().!?- ')
            ngram_flat = re.sub(r'<[^<>]+>', '', ngram)
            ngram_flat = stringtools.lexical_sort(ngram_flat)
            ngrams.append((ngram, ngram_flat))
    return ngrams
def _phrase_match(lemma, tokens, bigrams):
    words = lemma.split()
    if len(words) < 3:
        return None

    match = None
    if words[0] == "to":
        phrase_words = words[1:]
    else:
        phrase_words = words[:]
    phrase_words = [[w, stringtools.lexical_sort(w)] for w in phrase_words]
    for w in phrase_words:
        w_flat = w[1]
        if len(w_flat) > 3:
            w.append(w_flat[0:3])
        else:
            w.append(w_flat)
    phrase_flat = "".join([w[1] for w in phrase_words])

    for token_full, token_flat in bigrams:
        if token_flat == phrase_flat:
            match = token_full
            break

    if not match:
        phrase_length = len(phrase_words)
        for i in range(0, len(tokens) - 1):
            try:
                ngram = tokens[i : i + phrase_length]
            except IndexError:
                pass
            else:
                match_failed = False
                for p_token, q_token in zip(phrase_words, ngram):
                    if q_token[1].startswith(p_token[2]):
                        pass
                    elif p_token[0] in "one's" and q_token[0] in POSS_PRONOUNS:
                        pass
                    elif p_token[0] in "oneself" and q_token[0] in REFL_PRONOUNS:
                        pass
                    else:
                        match_failed = True
                        break
                if not match_failed:
                    match = " ".join([t[0] for t in ngram])
                    break
    return match
def _sanitize_lemma(instance, wordclass):
    new_lemma = instance.lemma()
    new_lemma = re.sub(r'\([^()]+\)', '', new_lemma)
    new_lemma = re.sub(r'\([a-z]+$', '', new_lemma)
    new_lemma = re.sub(r' +', ' ', new_lemma)
    new_lemma = new_lemma.strip()
    if wordclass == 'VB':
        new_lemma = re.sub(r'^to ', '', new_lemma)
    if wordclass == 'NN':
        new_lemma = re.sub(r'^(the|a|an) ', '', new_lemma)
    new_lemma = new_lemma[0:LEMMA_LENGTH]
    if new_lemma != instance.lemma():
        instance.node.find('./lemma').text = new_lemma
        instance.node.set('sortAlpha', lexical_sort(new_lemma))
def _compile_ngrams(tokens, length):
    ngrams = []
    for i in range(0, len(tokens)):
        try:
            window = tokens[i : i + length]
        except IndexError:
            pass
        else:
            ngram = " ".join(window)
            ngram = ngram.strip(",:;().!?- ")
            ngram = re.sub(r"'s$", "", ngram)
            # check for internal punctuation
            if any([p in ngram for p in PUNCTUATION]):
                pass
            else:
                ngrams.append((ngram, stringtools.lexical_sort(ngram)))
    return ngrams
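# Quick illustration of the sliding-window n-grams built by the
# _compile_ngrams() helpers above, again paired with a flattened form for
# matching. The flattener here is a hypothetical stand-in for
# stringtools.lexical_sort(), used only to keep the sketch self-contained.
import re

def _flatten(text):
    # Assumed approximation of stringtools.lexical_sort()
    return re.sub(r'[^a-z0-9]', '', text.lower())

def _bigrams(tokens):
    pairs = []
    for i in range(len(tokens) - 1):
        ngram = ' '.join(tokens[i:i + 2]).strip(',:;().!?- ')
        pairs.append((ngram, _flatten(ngram)))
    return pairs

print(_bigrams('a coil of whip cord'.split()))
# -> [('a coil', 'acoil'), ('coil of', 'coilof'), ('of whip', 'ofwhip'), ('whip cord', 'whipcord')]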
def _inflection_set(lemma, wordclass):
    lemma_flat = stringtools.lexical_sort(lemma)
    if wordclass == "NN":
        infs = {
            INFLECTOR.compute_inflection(lemma_flat, "NNS"),
            lemma_flat + "s",
            re.sub(r"(...)um$", r"\1a", lemma_flat),
            re.sub(r"(...)us$", r"\1i", lemma_flat),
            re.sub(r"(...)sis$", r"\1ses", lemma_flat),
        }
    elif wordclass == "VB":
        infs = {
            INFLECTOR.compute_inflection(lemma_flat, "VBZ"),
            INFLECTOR.compute_inflection(lemma_flat, "VBD"),
            INFLECTOR.compute_inflection(lemma_flat, "VBG"),
            INFLECTOR.compute_inflection(lemma_flat, "VBD", region="us"),
            INFLECTOR.compute_inflection(lemma_flat, "VBG", region="us"),
            lemma_flat + "in",
            lemma_flat + "eth",
            lemma_flat + "ethe",
            lemma_flat + "est",
            lemma_flat + "d",
            lemma_flat + "id",
            lemma_flat + "it",
            lemma_flat + "de",
            lemma_flat + "yng",
            lemma_flat + "ynge",
        }
    elif wordclass in ("JJ", "RB"):
        infs = {
            INFLECTOR.compute_inflection(lemma_flat, "JJR"),
            INFLECTOR.compute_inflection(lemma_flat, "JJS"),
            INFLECTOR.compute_inflection(lemma_flat, "JJR", region="us"),
            INFLECTOR.compute_inflection(lemma_flat, "JJS", region="us"),
            # We may as well throw in plural, since adj. and n. quotes
            # are often mixed together ('Zyrian', etc.)
            INFLECTOR.compute_inflection(lemma_flat, "NNS"),
        }
    else:
        infs = set()
    infs.add(lemma_flat)
    return infs
def _store_forms(block, entry, block_type, letter):
    us_variant = entry.us_variant()

    standardtypes = set()
    varianttypes = set()
    alientypes = set()
    for morphset in block.morphsets():
        if morphset.form in (entry.lemma, us_variant, block.lemma):
            _add_types(morphset, standardtypes, letter)
        elif (block_type == 'entry' and
                morphset.date().end > VARIANT_MINIMUM_END_DATE and
                not morphset.is_nonstandard()):
            # Don't store variants for subentries; don't store
            # very old or non-standard variants
            _add_types(morphset, varianttypes, letter)
        _add_alien_variants(morphset, alientypes, letter)
    varianttypes = varianttypes - standardtypes
    alientypes = alientypes - standardtypes

    refentry, refid = block.link(target='oed', asTuple=True)

    frequency = block.frequency()
    if frequency is not None:
        frequency = float('%.2g' % frequency)
        if frequency > 1:
            frequency = int(frequency)

    definition = block.definition(src='oed') or None

    return BlockData(refentry,
                     refid,
                     block_type,
                     stringtools.lexical_sort(block.lemma),
                     block.lemma,
                     block.wordclass(),
                     definition,
                     frequency,
                     block.date().exact('start'),
                     block.date().exact('end'),
                     None,
                     standardtypes,
                     varianttypes,
                     alientypes,)
def _deduplicate_instances(thesclass):
    thesclass.reload_instances()
    groups = defaultdict(list)
    for instance in thesclass.instances():
        groups[lexical_sort(instance.lemma())].append(instance)

    deletions = []
    for group in groups.values():
        if len(group) > 1:
            z = [i for i in group if not i.is_obsolete()] or group[:]
            z.sort(key=lambda i: i.num_quotations(), reverse=True)
            z.sort(key=lambda i: i.start_date())
            for instance in z[1:]:
                deletions.append(instance)

    if deletions:
        for instance in deletions:
            instance.selfdestruct()
        thesclass.reload_instances()
        thesclass.reset_size(len(thesclass.instances()))
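# The deduplication above groups thesaurus instances by flattened lemma and
# deletes all but one instance from each group. The same grouping pattern
# in miniature, using plain (lemma, num_quotations) tuples as hypothetical
# stand-ins and keeping the best-attested spelling from each group:
from collections import defaultdict

def dedupe(instances):
    groups = defaultdict(list)
    for lemma, quotes in instances:
        # Flatten the lemma so spelling variants fall into the same group
        key = ''.join(c for c in lemma.lower() if c.isalnum())
        groups[key].append((lemma, quotes))
    survivors = []
    for group in groups.values():
        group.sort(key=lambda item: item[1], reverse=True)
        survivors.append(group[0])  # keep the best-attested spelling
    return survivors

print(dedupe([('whip-cord', 3), ('whipcord', 12), ('lash', 5)]))
# -> [('whipcord', 12), ('lash', 5)]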
def __init__(self, **kwargs):
    node = kwargs.get('node', None)
    morphunits = kwargs.get('morphunits', None)
    if node is not None:
        self.sortcode = node.get('sort')
        self.variant_type = node.get('variantType')
        self.id = node.get('id')
        self.morphunits = [MorphUnit(n.findtext('./wordForm'), n.get('pos'))
                           for n in node.findall('./morphUnit')]
        self.score = int(node.get('score')) * 2 or 0
    elif morphunits is not None:
        self.morphunits = morphunits
        self.sortcode = lexical_sort(self.lemma)
        self.variant_type = kwargs.get('variant_type', 'default')
        self.score = kwargs.get('score', 0)
        self.id = kwargs.get('id', 0)
    self.source = self.lemma  # this should remain unchanged
    if self.variant_type != 'us':
        self.score += 1
    self.computed = False
def find_htlink(block, main_sense_data, confidence):
    if confidence is not None and confidence <= 2:
        #print(block.lemma, block.wordclass, confidence)
        return None
    if block.lemma in BANNED_LEMMAS:
        return None
    if block.wordclass not in ('NN', 'VB', 'JJ', 'RB', 'UH'):
        return None
    if (block.wordclass == 'RB' and
            not block.lemma.endswith('ly') and
            not block.lemma.endswith('wise') and
            not block.lemma.endswith('ways') and
            block.lemma not in ALLOWED_ADVERBS):
        return None

    if block.type == 'entry' and main_sense_data:
        main_sense = (block.refentry, main_sense_data.sense_id)
    elif block.type != 'entry':
        main_sense = (block.refentry, block.refid)
    else:
        main_sense = None

    if main_sense:
        qset = ThesaurusInstance.objects.filter(refentry=main_sense[0],
                                                refid=main_sense[1])
        # Double-check that these are the right p.o.s...
        records = [r for r in qset if r.wordclass() == block.wordclass]
        # ...and roughly the right lemma
        records = ([r for r in records if r.lemma == block.lemma] or
                   [r for r in records
                    if stringtools.lexical_sort(r.lemma) == block.sort])
        # sort so the record from the largest set is top
        records.sort(key=lambda r: r.thesclass.branch_size, reverse=True)
        if records and records[0].thesclass.node_size >= 3:
            return int(records[0].thesclass.id)
    return None
def _check_for_omissions(self):
    # If there's a secondary/alternative headword, check that this has
    # ended up included in the list of variants
    if self.lemma_manager.alt is not None:
        self.lemma_manager.refresh_variants_set()
        if not self.lemma_manager.in_variants_list(self.lemma_manager.alt.dictionary_sort):
            variant_form = VariantForm(self.lemma_manager.alt.lemma,
                                       self.date.start,
                                       self.date.projected_end())
            self.lemma_manager.variants.append(variant_form)

    varsets = []
    for varset in self.primary_sets():
        if varset.lemma == self.lemma:
            varsets.append(varset)
    variant_forms = _filter_varsets(self.primary_sets(), self.wordclass, self.date)
    if variant_forms:
        self.lemma_manager.refresh_variants_set()
        for variant_form in variant_forms:
            if not self.lemma_manager.in_variants_list(lexical_sort(variant_form.form)):
                self.lemma_manager.variants.append(variant_form)

    # Check that the entry headword(s) is represented; given that the ODE
    # lemma form may be substituted for the original OED lemma form, it's
    # possible that it's not.
    if self.date.end > 1750:
        for headword in self.headwords:
            matches = [vf for vf in self.lemma_manager.variants
                       if vf.form.replace('~', '') == headword.replace('~', '')]
            if not matches:
                variant_form = VariantForm(headword,
                                           self.date.start,
                                           self.date.projected_end())
                self.lemma_manager.variants.append(variant_form)
def lexical_sort(self):
    try:
        return self._lexical_sort
    except AttributeError:
        self._lexical_sort = stringtools.lexical_sort(self.lemma)
        return self._lexical_sort
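# The accessor above caches the result of stringtools.lexical_sort() on the
# instance by hand. A minimal sketch of the same memoization pattern using
# functools.cached_property (Python 3.8+); SimpleThing and its flattening
# logic are illustrative stand-ins, not part of the project:
from functools import cached_property

class SimpleThing:
    def __init__(self, lemma):
        self.lemma = lemma

    @cached_property
    def lexical_sort(self):
        # Computed once on first access, then cached on the instance
        return ''.join(c for c in self.lemma.lower() if c.isalnum())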
def test_lexical_sort(self):
    """
    Test stringtools.lexical_sort()
    """
    for source, _, result in self.test_texts:
        self.assertEqual(stringtools.lexical_sort(source), result)
def _sense_to_row(sense, status):
    if sense.definition is None:
        undefined = True
        definition = None
    else:
        undefined = False
        definition = sense.definition[:200]

    if sense.definition_supplement:
        definition_supplement = sense.definition_supplement[:150]
    else:
        definition_supplement = None

    try:
        reasoncode = sense.reason_code
    except AttributeError:
        reasoncode = None
    try:
        reasontext = sense.reason_text[:200]
    except (AttributeError, TypeError):
        reasontext = None

    try:
        thesclass1_id = sense.class_id
    except AttributeError:
        thesclass1_id = None
    try:
        thesclass2_id = sense.runners_up[0]
    except (AttributeError, IndexError):
        thesclass2_id = None
    try:
        thesclass3_id = sense.runners_up[1]
    except (AttributeError, IndexError):
        thesclass3_id = None

    if thesclass1_id is not None:
        thesclass = tdb.get_thesclass(thesclass1_id)
        level2branch = thesclass.ancestor(level=2)
        checkstatus = 'u'
    else:
        level2branch = None
        checkstatus = 'n'
    if level2branch is not None:
        level2branch_id = level2branch.id
    else:
        level2branch_id = None

    try:
        bayes = sense.bayes_classification
        bayes_confidence = sense.bayes_confidence
    except AttributeError:
        bayes = None
        bayes_confidence = 0

    row = [
        status,
        sense.lemma[:100],
        lexical_sort(sense.lemma)[:100],
        sense.wordclass or 'NN',
        definition,
        definition_supplement,
        sense.entry_id,
        sense.node_id,
        sense.entry_lemma[:50],
        lexical_sort(sense.entry_lemma)[:50],
        sense.subentry_type or 'main sense',
        undefined,
        random.randint(0, 10000),  # sample order
        bayes,
        bayes_confidence,
        _bayes_mismatch(sense),
        thesclass1_id,
        thesclass2_id,
        thesclass3_id,
        'u',  # checkbox for thesclass1 (unset)
        'i',  # checkbox for thesclass2 (incorrect)
        'i',  # checkbox for thesclass3 (incorrect)
        checkstatus,
        level2branch_id,
        reasontext,
        reasoncode,
        sense.clone_num,  # Gets changed to True/False before committing to DB
    ]
    return row
def tag_keyword(quotation, keyword):
    """
    Having identified the keyword within the quotation text (using
    KeywordFinder), mark the keyword by adding <kw> tags around it.
    """
    if keyword:
        serialized = etree.tounicode(quotation.text.node)
        qt_splitter = re.search(r'^(<qt(>| [^<>]*>))(.*)(</qt>)$', serialized)
        opentag = qt_splitter.group(1)
        text = ' ' + qt_splitter.group(3) + ' '
        closetag = qt_splitter.group(4)
        text_tagged = None

        keyword = _clean_brackets(keyword)
        keyword = keyword.replace('*', '.').replace('+', '.')
        keyword_flat = stringtools.lexical_sort(keyword)

        matches = None
        for m in ('([ (>])(' + keyword + ')([,;:!?)<. ])',
                  '(.)(' + keyword + ')([,;:!?)<. -])',
                  '([^a-zA-Z])(' + keyword + ')([^a-zA-Z])',
                  '([ (>-])(' + keyword + ')(.)',
                  '(.)(' + keyword + ')(.)'):
            matches = re.findall(m, text)
            if matches:
                break
        if matches:
            prec, match, following = matches[0]
            before = prec + match + following
            after = prec + '<kw>' + match + '</kw>' + following
            text_tagged = text.replace(before, after)

        if not text_tagged:
            text2 = re.sub(r'<([a-z]+) [^<>]*/>', r'<\1/>', text)
            text2 = re.sub(r'<([a-z]+) [^<>]*>', r'<\1>', text2)
            tokens = text2.split()
            for token in tokens:
                token2 = re.sub(r'<[^<>]+>', '', token)
                token2 = token2.strip(',:;!?.()')
                if token2 == keyword:
                    target = token.strip(',:;!?.()')
                    text_tagged = text2.replace(target, '<kw>' + target + '</kw>')
                    break

        if not text_tagged:
            for round in (1, 2):
                text2 = re.sub(r'<([a-z]+) [^<>]*/>', r'<\1/>', text)
                text2 = re.sub(r'<([a-z]+) [^<>]*>', r'<\1>', text2)
                # text_true is the version we'll actually be tagging
                # - with ellipses, etc., still in place
                text_true = text2
                if round == 2:
                    # Replace ellipses and m-dashes with spaces, so that
                    # adjacent words get tokenized
                    for char in ('\u2025', '\u2026', '\u2014'):
                        text2 = text2.replace(char, ' ')
                # Tokenize and make into ngrams
                tokens = text2.split()
                ngrams = (_compile_ngrams(tokens, 1) +
                          _compile_ngrams(tokens, 2) +
                          _compile_ngrams(tokens, 3) +
                          _compile_ngrams(tokens, 4) +
                          _compile_ngrams(tokens, 5) +
                          _compile_ngrams(tokens, 6))
                target = None
                for ngram_full, ngram_flat in ngrams:
                    if keyword_flat == ngram_flat:
                        target = ngram_full
                        break
                if target:
                    # Strip ellipses and dashes
                    target = target.strip('\u2025\u2026\u2014')
                    text_tagged = text_true.replace(target, '<kw>' + target + '</kw>')
                    break

        if not text_tagged:
            keyword_tokens = keyword.split()
            if len(keyword_tokens) >= 2:
                first = re.findall(keyword_tokens[0], text)
                last = re.findall(keyword_tokens[-1], text)
                if len(first) == 1 and len(last) == 1:
                    pattern = ('(' + keyword_tokens[0] + '.*?' +
                               keyword_tokens[-1] + ')')
                    text_tagged = re.sub(pattern, r'<kw>\1</kw>', text)

        #print('----------------------------------------------------')
        #print(serialized)
        #print('|' + keyword + '|')
        #print(text_tagged)

        if text_tagged and '<kw>' in text_tagged:
            serialized_tagged = opentag + text_tagged.strip() + closetag
            try:
                node_tagged = etree.fromstring(serialized_tagged)
            except etree.XMLSyntaxError:
                pass
            else:
                parent = quotation.text.node.getparent()
                parent.replace(quotation.text.node, node_tagged)
    else:
        pass
def _replace_lemma(entry, headword):
    return entry._replace(lemma=headword,
                          sort=stringtools.lexical_sort(headword))