def get_label_type(expl, word):
    """Build an underscore-separated label from a word type and *expl*.

    The result is ``<type>_<item>_<item>...`` where ``<type>`` is
    ``proper(word.Type)`` (or an empty string when ``word.Type`` is None)
    and at most four items are taken, in this order, from:

    1. Templates ({{ }}): one word per template, chosen by
       ``get_lognest_word`` from the template's raw text, upper-cased.
    2. Links ([[ ]]): one word per link, chosen by ``get_lognest_word``
       from the link text, passed through ``proper``.
    3. Plain text: every word of length >= 3 found in the non-recursive
       ``String`` objects, lower-cased.

    Objects returned by ``expl.find_objects((Li, Dl))`` are passed as the
    ``exclude`` argument of every search.

    :param expl: node exposing ``find_objects``.
    :param word: object exposing a ``Type`` attribute (may be None).
    :return: the label string.
    """
    wt = proper(word.Type) if word.Type is not None else ""

    # The exclusion list is identical for all three searches: build it once.
    excluded = list(expl.find_objects((Li, Dl)))

    # Templates: normalise the raw text, split it into words, keep one word.
    template_items = []
    for template in expl.find_objects(Template, recursive=True, exclude=excluded):
        s = convert_to_alnum(template.raw, '_')
        s = deduplicate(s, '_')
        s = s.strip('_')
        words = []
        for part in s.split("|"):
            for chunk in part.split('_'):
                words += chunk.split(' ')
        template_items.append(get_lognest_word(words).upper())

    # Links: same normalisation on the link text, result in proper case.
    link_items = []
    for link in expl.find_objects(Link, recursive=True, exclude=excluded):
        s = convert_to_alnum(link.get_text(), '_')
        s = deduplicate(s, '_')
        s = s.strip('_')
        words = [w for chunk in s.split('_') for w in chunk.split(' ')]
        link_items.append(proper(get_lognest_word(words)))

    # Plain text: join the top-level strings, drop parentheses, normalise,
    # then keep the lower-cased words that are at least 3 characters long.
    texts = [
        node.get_text()
        for node in expl.find_objects(String, recursive=False, exclude=excluded)
    ]
    s = " ".join(texts)
    s = s.replace('(', ' ').replace(')', ' ')
    s = deduplicate(s, ' ')
    s = convert_to_alnum(s)
    s = deduplicate(s, '_')
    s = s.strip('_')
    text_items = [
        w.lower()
        for chunk in s.split('_')
        for w in chunk.split(' ')
        if len(w) >= 3
    ]

    # TYPE + the first four items of templates + links + text words.
    biglst = template_items + link_items + text_items
    return wt + "_" + "_".join(biglst[:4])
# Example #2
# 0
def get_label_type(page, tense, infinitive, pronoun):
    """Compose a verb label, e.g. ``Verb_To_do_You_Indicative_Perfect``.

    Joins ``"Verb"``, ``proper(page.label)``, ``proper(pronoun)`` and
    *tense* with underscores, then replaces every space in the result
    with an underscore. *infinitive* is accepted but not used.
    """
    pieces = ["Verb", proper(page.label), proper(pronoun), tense]
    label = '_'.join(pieces)
    return label.replace(' ', '_')
def get_label_type_v1(expl, word):
    """Build a dash-separated label from a word type and *expl*.

    Items are collected, in this order, from:

    1. Templates and Links (recursive): words of their raw text,
       upper-cased, keeping those of length >= 3.
    2. Links (recursive): parts of their text split on ``_``, each passed
       through ``proper``.
    3. Direct children of *expl* that are not ``Li``, ``Template`` or
       ``Link``: words of their text, lower-cased, keeping those of
       length >= 4.

    Objects returned by ``expl.find_objects((Li, Dl))`` are passed as the
    ``exclude`` argument of both searches.

    :param expl: node exposing ``find_objects`` and ``childs``.
    :param word: object exposing a ``Type`` attribute (may be None).
    :return: ``<type>-<item>`` when exactly one item was collected,
        ``<type>-<i1>-<i2>-<i3>`` when three or more were collected,
        otherwise just ``<type>``.
    """
    wt = proper(word.Type) if word.Type is not None else ""

    # The exclusion list is identical for both searches: build it once.
    excluded = list(expl.find_objects((Li, Dl)))

    # Templates and links: upper-cased words of the raw markup.
    list1 = []
    for t in expl.find_objects((Template, Link), recursive=True, exclude=excluded):
        s = convert_to_alnum(t.raw)
        s = s.replace('_', ' ')
        s = deduplicate(s, ' ')
        s = s.strip()
        list1 += [w.upper() for w in s.split(" ")]
    # Length is checked after upper-casing, as case mapping can change it.
    list1 = [w for w in list1 if len(w) >= 3]

    # Links: proper-cased parts of the link text.
    list2 = []
    for link in expl.find_objects(Link, recursive=True, exclude=excluded):
        s = convert_to_alnum(link.get_text())
        s = deduplicate(s)
        s = s.strip("_").strip()
        list2 += [proper(w) for w in s.split("_")]

    # Plain text: skip deep lists, templates {{ }} and links [[ ]].
    # The link check previously tested Template a second time, so link
    # text was never skipped here.
    texts = [
        c.get_text()
        for c in expl.childs
        if not isinstance(c, (Li, Template, Link))
    ]
    s = "".join(texts)
    s = convert_to_alnum(s)
    s = s.replace('_', ' ')
    s = deduplicate(s, ' ')
    s = s.strip()
    list3 = [w.lower() for w in s.split(" ")]
    list3 = [w for w in list3 if len(w) >= 4]

    # Concat
    biglst = list1 + list2 + list3

    if len(biglst) == 1:
        return wt + "-" + biglst[0]
    if len(biglst) >= 3:
        return wt + "-" + "-".join(biglst[:3])
    # NOTE(review): zero or exactly two items yield the bare type; dropping
    # the two-item case may be unintended - confirm with callers.
    return wt