Exemplo n.º 1
0
def _get_job_desc(doc):
    """Extract the job-description markup from ``doc``.

    Each XPath expression in the module-level ``job_desc_patterns`` is
    tried in order.  All elements matched by one expression are serialized
    with ``etree.tounicode`` and concatenated, then passed through
    ``content_format``.  A candidate is rejected when, after removing the
    section-heading tokens and spaces, fewer than 10 characters remain.

    If no pattern yields an acceptable candidate, the first element matched
    by ``job_desc_pattern_other`` (if any) is serialized and formatted
    instead, without the length check.

    :param doc: object exposing ``xpath()``; presumably a parsed lxml
        document -- confirm against the caller.
    :return: ``DtString(text, DataFlag(hasValue=True).toByte())`` when
        content was found, otherwise an empty ``DtString()``.
    """
    # Section headings (and spaces) that do not count as real content.
    # A plain u'' literal is used instead of ur'': the pattern contains no
    # backslashes, so the value is identical, and ur'' is a syntax error on
    # Python 3.
    heading_tokens = u'岗位要求|岗位职责|工作描述|职位描述| '

    for pt in job_desc_patterns:
        markup = "".join(
            etree.tounicode(node, pretty_print=True) for node in doc.xpath(pt)
        )
        markup = content_format(markup)
        # Skip candidates that are (almost) nothing but headings.
        if len(re.sub(heading_tokens, '', markup)) < 10:
            continue
        return DtString(markup, DataFlag(hasValue=True).toByte())

    # Fallback: only the FIRST match of the catch-all pattern is used.
    first = next(iter(doc.xpath(job_desc_pattern_other)), None)
    if first is None:
        return DtString()

    markup = content_format(etree.tounicode(first, pretty_print=True))
    return DtString(markup, DataFlag(hasValue=True).toByte())
Exemplo n.º 2
0
def fetch_content_by_patterns(doc, patterns):
    """Return the formatted text of the first pattern with enough content.

    The XPath expressions in ``patterns`` are evaluated against ``doc`` in
    order.  For each one, the ``text_content()`` of every matched element is
    collected with a newline appended after each, and the joined text is
    passed through ``content_format``.  The first formatted text that is at
    least 10 characters long is returned.

    :param doc: object exposing ``xpath()``.
    :param patterns: iterable of XPath expressions, tried in order.
    :return: the formatted text, or ``""`` when no pattern qualifies.
    """
    for xpath_expr in patterns:
        pieces = [node.text_content() + '\n' for node in doc.xpath(xpath_expr)]
        text = content_format("".join(pieces))
        if len(text) >= 10:
            return text
    return ""
Exemplo n.º 3
0
def _get_inc_intro(doc):
    """Extract the company-introduction text from ``doc``.

    The XPath expressions in the module-level ``company_patterns`` are
    evaluated in order.  For each one, the ``text_content()`` of every
    matched element is collected with a newline appended after each, and
    the joined text is passed through ``content_format``.  The first
    formatted text that is strictly longer than 10 characters wins.

    :param doc: object exposing ``xpath()``.
    :return: ``DtString(text, DataFlag(hasValue=True).toByte())`` for the
        first qualifying pattern, otherwise an empty ``DtString()``.
    """
    for xpath_expr in company_patterns:
        pieces = [node.text_content() + '\n' for node in doc.xpath(xpath_expr)]
        text = content_format("".join(pieces))
        # Exactly 10 characters is still rejected (strict comparison).
        if len(text) <= 10:
            continue
        return DtString(text, DataFlag(hasValue=True).toByte())

    return DtString()