def get_label_type(expl, word):
    """Build an underscore-separated label for an explanation node.

    The label is the type prefix followed by the first four tokens taken
    from three lists concatenated in this order:

    1. one UPPERCASE token per ``Template`` found (recursively) in ``expl``;
    2. one ``proper()``-cased token per ``Link`` found (recursively);
    3. lowercase words of length >= 3 from the non-recursive ``String``
       search.

    Every search passes the objects returned by
    ``expl.find_objects((Li, Dl))`` as ``exclude``.

    Args:
        expl: node exposing ``find_objects(...)``.
        word: object whose ``Type`` attribute (possibly ``None``) supplies
            the prefix.

    Returns:
        ``proper(word.Type)`` (or ``""`` when ``Type`` is ``None``), then
        ``"_"``, then up to four tokens joined by ``"_"``. When no token is
        found the result is the prefix followed by a single underscore.

    NOTE(review): ``convert_to_alnum`` and ``deduplicate`` are assumed, per
    the original comments, to replace non-alphanumerics with the given
    character and to collapse runs of it; ``get_lognest_word`` presumably
    returns the longest entry. Confirm against their definitions.

    NOTE(review): a second top-level ``get_label_type(page, tense,
    infinitive, pronoun)`` appears later in this file; if both live in the
    same module, the later definition rebinds the name and this one becomes
    unreachable. Confirm which one callers expect.
    """
    prefix = "" if word.Type is None else proper(word.Type)

    # Tokens from {{ templates }}: longest word of each template, uppercased.
    template_tokens = []
    for template in expl.find_objects(
            Template,
            recursive=True,
            exclude=list(expl.find_objects((Li, Dl)))):
        cleaned = deduplicate(convert_to_alnum(template.raw, '_'), '_')
        cleaned = cleaned.strip('_')
        candidates = [
            token
            for bar_part in cleaned.split("|")
            for underscore_part in bar_part.split('_')
            for token in underscore_part.split(' ')
        ]
        template_tokens.append(get_lognest_word(candidates).upper())

    # Tokens from [[ links ]]: longest word of each link text, proper-cased.
    link_tokens = []
    for link in expl.find_objects(
            Link,
            recursive=True,
            exclude=list(expl.find_objects((Li, Dl)))):
        cleaned = deduplicate(convert_to_alnum(link.get_text(), '_'), '_')
        cleaned = cleaned.strip('_')
        candidates = [
            token
            for underscore_part in cleaned.split('_')
            for token in underscore_part.split(' ')
        ]
        link_tokens.append(proper(get_lognest_word(candidates)))

    # Tokens from plain text: parentheses become spaces, then the text is
    # normalised and split; only words of three or more characters survive.
    fragments = [
        node.get_text()
        for node in expl.find_objects(
            String,
            recursive=False,
            exclude=list(expl.find_objects((Li, Dl))))
    ]
    text = " ".join(fragments).replace('(', ' ').replace(')', ' ')
    text = deduplicate(text, ' ')
    text = convert_to_alnum(text)
    text = deduplicate(text, '_').strip('_')
    text_tokens = [
        token.lower()
        for underscore_part in text.split('_')
        for token in underscore_part.split(' ')
        if len(token) >= 3
    ]

    # Prefix + the first four tokens overall (templates first, then links,
    # then plain words).
    selected = (template_tokens + link_tokens + text_tokens)[:4]
    return prefix + "_" + "_".join(selected)
def get_label_type(page, tense, infinitive, pronoun):
    """Build a conjugation label of the shape ``Verb_<label>_<pronoun>_<tense>``.

    The four parts are joined with underscores, and every space in the
    result (for example inside a multi-word tense) is then replaced by an
    underscore as well.

    Args:
        page: object whose ``label`` attribute is passed through ``proper()``.
        tense: tense name, used as-is.
        infinitive: not used by this function; kept for the call signature.
        pronoun: pronoun, passed through ``proper()``.

    Returns:
        The label string with no spaces.

    NOTE(review): an earlier top-level ``get_label_type(expl, word)`` appears
    in this file; if both live in the same module this later definition is
    the one bound to the name. Confirm that is intended.
    """
    parts = ("Verb", proper(page.label), proper(pronoun), tense)
    return "_".join(parts).replace(' ', '_')
def get_label_type_v1(expl, word):
    """Build a dash-separated label for an explanation node (older variant).

    Tokens are collected into three lists and concatenated in this order:

    1. UPPERCASE words (length >= 3) from the ``raw`` text of every
       ``Template`` and ``Link`` found recursively in ``expl``;
    2. ``proper()``-cased words from the text of every ``Link`` found
       recursively (no length filter);
    3. lowercase words (length >= 4) from the text of the direct children
       of ``expl`` that are not ``Li``, ``Template`` or ``Link``.

    Both ``find_objects`` searches pass the objects returned by
    ``expl.find_objects((Li, Dl))`` as ``exclude``.

    Fix over the original: the child loop tested ``isinstance(c, Template)``
    twice, the second time under the comment "skip Links", so links were
    never skipped and their text leaked into list 3. The second test now
    checks ``Link``.

    Args:
        expl: node exposing ``find_objects(...)`` and ``childs``.
        word: object whose ``Type`` attribute (possibly ``None``) supplies
            the prefix.

    Returns:
        ``prefix-tok1[-tok2-tok3]`` when exactly one token or at least three
        tokens were collected (at most three are used); otherwise the bare
        prefix, where the prefix is ``proper(word.Type)`` or ``""``.

    NOTE(review): ``convert_to_alnum`` and ``deduplicate`` are assumed to
    replace non-alphanumerics with ``_`` and to collapse repeated
    separators; confirm against their definitions.
    """
    prefix = "" if word.Type is None else proper(word.Type)

    # 1. Uppercase words from templates and links (raw markup).
    upper_tokens = []
    for node in expl.find_objects(
            (Template, Link),
            recursive=True,
            exclude=list(expl.find_objects((Li, Dl)))):
        text = convert_to_alnum(node.raw).replace('_', ' ')
        text = deduplicate(text, ' ').strip()
        upper_tokens += [w.upper() for w in text.split(" ")]
    upper_tokens = [w for w in upper_tokens if len(w) >= 3]

    # 2. Proper-cased words from link texts.
    link_tokens = []
    for link in expl.find_objects(
            Link,
            recursive=True,
            exclude=list(expl.find_objects((Li, Dl)))):
        text = deduplicate(convert_to_alnum(link.get_text()))
        text = text.strip("_").strip()
        link_tokens += [proper(w) for w in text.split("_")]

    # 3. Lowercase words from the remaining direct children. Nested lists,
    # templates {{ }} and links [[ ]] are skipped.
    texts = [
        child.get_text()
        for child in expl.childs
        if not isinstance(child, (Li, Template, Link))
    ]
    text = convert_to_alnum("".join(texts)).replace('_', ' ')
    text = deduplicate(text, ' ').strip()
    lower_tokens = [w.lower() for w in text.split(" ")]
    lower_tokens = [w for w in lower_tokens if len(w) >= 4]

    tokens = upper_tokens + link_tokens + lower_tokens

    # NOTE(review): with exactly two tokens the original returned only the
    # prefix, dropping both tokens. That rule is preserved here; confirm
    # whether it is intentional.
    count = len(tokens)
    if count == 1 or count >= 3:
        return prefix + "-" + "-".join(tokens[:3])
    return prefix