Exemplo n.º 1
0
def process_ws(items):
    """Parse one Heilongjiang crawl payload and store the resulting record.

    The record is always written to ``judge_doc/local_doc`` through ``ines``.
    It is additionally written to ``judge_doc/total_doc`` only when
    ``is_exists`` reports an entry with the same ``case_no`` there.

    :param items: raw crawl payload handed to ``heilongjiang_list`` and
        ``heilongjiang_article``.
    """
    local_doc_url = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc_url = 'http://10.1.1.28:9200/judge_doc/total_doc'

    listing = heilongjiang_list(items)
    lines = heilongjiang_article(items)

    # Everything from the sixth element onwards is used as the document body.
    body_lines = lines[5:]
    ws = WenshuBase('\n'.join(body_lines))
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))

    title = listing.get('title')
    trial_type = type_extract(title)
    reasons = reason_extract(
        reason_description=ws.reason_description,
        title=title,
        trial_type=trial_type,
    )
    trial_date = trial_date_extract(''.join(ws.court_paragraph))
    court_name = listing.get('court_name')
    court_level = court_level_extract(court_name)
    trial_round = trial_round_extract(title)
    content_type = content_type_extract(ws.verdict_paragraph, title)
    # The fifth element of the article lines is used as the case number.
    case_no = lines[4]

    # A claim is only extracted for first-instance judgments whose reasons
    # include a level-2 reason code of 104 or 105.
    claim = ''
    if content_type == '判决书' and trial_round == '一审':
        for reason in reasons:
            if reason['reason_code_level2'] in (104, 105):
                claim = claim_extract(ws.claims_paragraphs)

    clean_case_no = pbracket(case_no)
    record = {
        'case_no': clean_case_no,
        'reasons': reasons,
        'source': '黑龙江市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        'content': '<br>'.join(body_lines),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': trial_round,
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': trial_date,
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': get_md5(title) + get_md5(clean_case_no)
    }

    doc_id = record['instrument_id']
    ines(id=doc_id, path=local_doc_url, data=record)

    already_in_total = is_exists(url=total_doc_url,
                                 field='case_no',
                                 value=record['case_no'])
    if already_in_total:
        ines(id=doc_id, path=total_doc_url, data=record)
Exemplo n.º 2
0
def process_ws(items):
    """Mine one crawled item and store it; on any error store a failure record.

    On success the mined record is written to ``judge_doc/local_doc`` and,
    when ``is_exists`` finds its ``case_no`` in ``judge_doc/total_doc``, to
    that index as well. Any exception raised along the way (mining or
    storing) is caught, and a failure document describing the item is printed
    and written to ``fail_record/fail_record`` instead.

    :param items: crawled item; read with ``.get`` and measured with ``len``.
    """
    local_doc_url = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc_url = 'http://10.1.1.28:9200/judge_doc/total_doc'
    fail_record_url = 'http://10.1.1.28:9200/fail_record/fail_record'

    try:
        record = mining(items)
        doc_id = record['instrument_id']
        ines(id=doc_id, path=local_doc_url, data=record)
        found = is_exists(url=total_doc_url,
                          field='case_no',
                          value=record['case_no'])
        if found:
            ines(id=doc_id, path=total_doc_url, data=record)
    except Exception as exc:
        # Same id scheme as a mined record: md5(title) + md5(cleaned case_no).
        title_hash = get_md5(items.get('title'))
        case_hash = get_md5(pbracket(items.get('case_no', '')))
        failure_id = title_hash + case_hash
        failure = {
            "_reason_": str(exc),
            "data_size": len(items),
            "crawl_time": update_time(),
            "processed": False,
            "hostname": "worker1.yscredit.com",
            "data": items,
            "create_time": update_time(),
            "ip": "null",
            "_id_": failure_id,
            "topic": "裁判文书"
        }
        print(failure)
        ines(id=failure_id, path=fail_record_url, data=failure)
Exemplo n.º 3
0
def mining(items):
    """Build a judgment-document record from one crawled item.

    Crawler-supplied fields are copied through. ``type`` and ``trial_round``
    fall back to extraction from the title when the item has no truthy value
    for them. When the item carries article text, the text is parsed with
    ``WenshuBase`` and the derived fields (litigants, agents, court officers,
    reasons, verdict, claim, ...) are added.

    :param items: crawled item, read with ``.get``; ``articles`` is expected
        to be a string that evaluates to a sequence of text lines.
    :return: dict record; always contains ``instrument_id``.
    """
    raw_articles = items.get('articles')
    # SECURITY NOTE(review): eval() executes whatever expression the crawled
    # 'articles' string contains. If this field can come from an untrusted
    # source, consider ast.literal_eval or a JSON payload instead.
    articles = eval(raw_articles) if raw_articles else []
    article = '\n'.join(articles)
    title = items.get('title')

    # Fields that must always be present.
    record = {
        'case_no': items.get('case_no', ''),
        'publish_date': items.get('publish_date'),
        'court_name': items.get('court_name', ''),
        'source': items.get('source'),
        'title': items.get('title', ''),
        'update_time': update_time(),
        'org_url': items.get('org_url'),
    }

    # Fields that may be absent from the item; derive them when they are.
    record['type'] = items.get('type') or type_extract(title)
    record['trial_round'] = (items.get('trial_round')
                             or trial_round_extract(title))
    content_type = items.get('content_type')
    if content_type:
        record['content_type'] = content_type_extract(
            content_type=content_type)
    else:
        record['content_type'] = None
    reason = items.get('reason')
    trial_date = items.get('trial_date')

    if articles:
        ws = WenshuBase(article)
        litigants, agents = litigants_agent_extract('\n'.join(
            ws.role_paragraph))
        court_officers = court_extract('\n'.join(ws.court_paragraph))
        litigation_request, claim = claim_extract(ws.claims_paragraphs)
        if not trial_date:
            trial_date = trial_date_extract(ws.court_paragraph)
        court_level = court_level_extract(record.get('court_name'))
        # With article text available, the crawler's raw content_type wins;
        # otherwise it is derived from the verdict paragraph and title.
        if not content_type:
            content_type = content_type_extract(
                verdict=ws.verdict_paragraph, title=record.get('title'))
        reasons = reason_extract(ws.reason_description, record.get('title'),
                                 record.get('type'), reason)
        record.update({
            'litigants': litigants,
            'agents': agents,
            'court_officers': court_officers,
            'court_level': court_level,
            'content': '<br>'.join(articles),
            'content_type': content_type,
            'reasons': reasons,
            'verdict': ws.verdict,
            'trial_date': trial_date,
            'litigation_request': litigation_request,
            'claim': claim,
        })

    title_hash = get_md5(record.get('title'))
    case_hash = get_md5(pbracket(record.get('case_no')))
    record['instrument_id'] = title_hash + case_hash
    return record
Exemplo n.º 4
0
     trial_type=type)
 trial_date = trial_date_extract(''.join(ws.court_paragraph))
 court_level = court_level_extract(l.get('court_name'))
 trial_round = trial_round_extract(l.get('title'))
 content_type = content_type_extract(ws.verdict_paragraph,
                                     l.get('title'))
 case_no = l.get('case_no')
 claim = ''
 if content_type == '判决书' and trial_round == '一审':
     for reason in reasons:
         if reason['reason_code_level2'] == 104 or reason[
                 'reason_code_level2'] == 105:
             claim = claim_extract(ws.claims_paragraphs)
 obj = {
     'case_no':
     pbracket(l.get('case_no')),
     'reasons':
     reasons,
     'source':
     '青海高级人民法院',
     'type':
     type,
     'title':
     pbracket(l.get('title')),
     'content':
     '<br>'.join(a),
     'agents':
     agents,
     'update_time':
     update_time(),
     'litigants':
Exemplo n.º 5
0
def process_ws(items):
    """Parse one Shanghai crawl payload and store the resulting record.

    The record is always written to ``judge_doc/local_doc`` through ``ines``.
    It is additionally written to ``judge_doc/total_doc`` only when
    ``is_exists`` reports an entry with the same ``case_no`` there.

    :param items: raw crawl payload handed to the ``shanghai_*`` helpers.
    """
    local_doc_url = 'http://10.1.1.28:9200/judge_doc/local_doc'
    total_doc_url = 'http://10.1.1.28:9200/judge_doc/total_doc'

    listing = shanghai_list(items)
    trial_type = shanghai_trial_type(items)
    court_name = shanghai_court_name(items)
    article = shanghai_aricle(items)

    ws = WenshuBase(article)
    litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
    court_officers = court_extract('\n'.join(ws.court_paragraph))

    title = listing['title']
    content_type = content_type_extract(ws.verdict_paragraph, title)
    reasons = reason_extract(
        reason_description=ws.reason_description,
        title=title,
        trial_type=trial_type,
    )
    court_level = court_level_extract(court_name)

    # A claim is only extracted for first-instance judgments whose reasons
    # include a level-2 reason code of 104 or 105.
    claim = ''
    if content_type == '判决书' and listing['trial_round'] == '一审':
        for reason in reasons:
            if reason['reason_code_level2'] in (104, 105):
                claim = claim_extract(ws.claims_paragraphs)

    clean_case_no = pbracket(listing['case_no'])
    record = {
        'case_no': clean_case_no,
        'reasons': reasons,
        'source': '上海市高级人民法院',
        'type': trial_type,
        'title': pbracket(title),
        'content': re.sub('\n', '<br>', article),
        'agents': agents,
        'update_time': update_time(),
        'litigants': litigants,
        'content_type': content_type,
        'trial_round': listing['trial_round'],
        'court_level': court_level,
        'verdict': ws.verdict,
        'trial_date': listing['trial_date'],
        'court_officers': court_officers,
        'court_name': court_name,
        'claim': claim,
        'operator': 'leifeng',
        'instrument_id': get_md5(title) + get_md5(clean_case_no)
    }

    doc_id = record['instrument_id']
    ines(id=doc_id, path=local_doc_url, data=record)

    already_in_total = is_exists(url=total_doc_url,
                                 field='case_no',
                                 value=record['case_no'])
    if already_in_total:
        ines(id=doc_id, path=total_doc_url, data=record)
Exemplo n.º 6
0
 litigants, agents = litigants_agent_extract('\n'.join(ws.role_paragraph))
 court_officers = court_extract('\n'.join(ws.court_paragraph))
 type = type_extract(l.get('title'))
 reasons = reason_extract(reason_description=ws.reason_description, title=l.get('title'), trial_type=type)
 trial_date = trial_date_extract(''.join(ws.court_paragraph))
 court_level = court_level_extract(l.get('court_name'))
 trial_round = trial_round_extract(l.get('title'))
 content_type = content_type_extract(ws.verdict_paragraph, l.get('title'))
 case_no = l.get('case_no')
 claim = ''
 if content_type == '判决书' and trial_round == '一审':
     for reason in reasons:
         if reason['reason_code_level2'] == 104 or reason['reason_code_level2'] == 105:
             claim = claim_extract(ws.claims_paragraphs)
 obj = {
     'case_no': pbracket(l.get('case_no')),
     'reasons': reasons,
     'source': '吉林高级人民法院',
     'type': type,
     'title': pbracket(l.get('title')),
     'content': '<br>'.join(a),
     'agents': agents,
     'update_time': update_time(),
     'litigants': litigants,
     'content_type': content_type,
     'trial_round': trial_round,
     'court_level': court_level,
     'verdict': ws.verdict,
     'trial_date': trial_date,
     'court_officers': court_officers,
     'court_name': l.get('court_name'),
Exemplo n.º 7
0
def person_extract(role_paragraph):
    """Collect litigant and agent records from the role section text.

    Every pattern returned by ``reg()`` is matched against the text. For each
    match, group 0 is used as the role label and group 1 as the sentence that
    describes the person. The sentence is split at its first punctuation mark
    into a name part and a resume part; the line containing the sentence is
    then blanked so later patterns do not match it again.

    :param role_paragraph: newline-separated role section of a document.
    :return: list of person dicts sorted by ``local`` (index of the source
        line). When no pattern produced a person, a fallback list is built
        from every line ending in a colon/comma/semicolon; those dicts only
        carry ``name``, ``local``, ``resume``, ``role`` and ``sentence``.
    :raises IndexError: if a matched sentence is not contained in any of the
        remaining lines.
    :raises KeyError: if a non-empty role label is missing from
        ``person_mapping``.
    """
    regs = reg()
    persons = []

    # Drop lines mentioning an enforcement ruling / decision / notice.
    # any() is required here: `('a' or 'b' or 'c') in rp` would only test the
    # first marker, because `or` returns its first truthy operand.
    skip_markers = ('执行裁定书', '执行决定书', '执行通知书')
    del_paragraphs = [
        rp for rp in role_paragraph.split('\n')
        if any(marker in rp for marker in skip_markers)
    ]
    for dp in del_paragraphs:
        role_paragraph = role_paragraph.replace(dp, '')
    role_paragraphs = role_paragraph.split('\n')

    for num, r in enumerate(regs):
        for items in re.findall(r, role_paragraph, re.M):
            identity_type, identity, party_position = '', '', ''
            role = items[0]
            sentence = items[1]

            # For the first pattern only, skip sentences that contain a
            # closing bracket without an opening one.
            # NOTE(review): both operands of each `or` are identical here;
            # presumably one of each pair was a full-width bracket - confirm.
            if (')' in sentence or ')' in sentence) and (
                    '(' not in sentence or '(' not in sentence) and num == 0:
                continue
            if not sentence:
                continue

            # Strip one leading punctuation mark, if any.
            _sentence = sentence[1:] if sentence[0] in [
                ',', ',', '。', '.', ';', ';', '∶'
            ] else sentence

            # Positions of the first occurrence of each punctuation mark.
            # find() yields -1 for absent marks; discard() also copes with
            # the case where every mark is present (list.remove(-1) would
            # raise ValueError there).
            positions = {
                _sentence.find(mark)
                for mark in [',', ',', '。', '.', ';', ';']
            }
            positions.discard(-1)
            symbols = sorted(positions)
            if symbols:
                # Name is everything before the first mark, resume the rest.
                _name = ps(_sentence[:symbols[0]])
                resume = _sentence[symbols[0]:]
                resume = resume[1:] if resume[0] in [
                    ',', ',', '。', '.', ';'
                ] else resume
            else:
                _name = ps(_sentence)
                resume = ''

            bir = pb(resume)
            i = pr(_name) if _name else None
            if i:
                # pr() gives the index at which the name is cut; the tail is
                # kept as a remark.
                name = _name[:i]
                remark = _name[i:]
            else:
                name = _name
                remark = ''

            # Prefer a law-firm mention that follows a punctuation mark;
            # otherwise search the whole resume.
            law_firm_sentence = re.findall(
                "[,,。.;;](.*?(?:律师事务所|法律服务所))", resume)
            law_firm_sentence = law_firm_sentence[
                0] if law_firm_sentence else resume
            law_firm = re.findall("系?(.*?(?:律师事务所|法律服务所))",
                                  law_firm_sentence)
            law_firm = law_firm[0] if law_firm else ''

            # Locate the source line, remember its index, and blank it so it
            # is not matched again by the following patterns.
            s = [t for t in role_paragraphs if sentence in t][0]
            local = role_paragraphs.index(s)
            role_paragraphs[local] = ''
            role_paragraph = '\n'.join(role_paragraphs)

            if role:
                mapping = person_mapping[role]
                identity_type = mapping.get('identity_type')
                identity = mapping.get('identity')
                party_position = mapping.get('party_position', '')

            persons.append({
                'name': pbracket(name),
                'local': local,
                'resume': resume,
                'role': role,
                'sentence': s,
                'bir': bir,
                'remark': remark,
                'identity_type': identity_type,
                'identity': identity,
                'party_position': party_position,
                'law_firm': pbracket(law_firm)
            })

    if persons:
        return sorted(persons, key=lambda x: x['local'])

    # Nothing matched: fall back to lines that end in a colon, comma or
    # semicolon and treat the text before that mark as a name.
    return [{
        'name': p,
        'local': 0,
        'resume': '',
        'role': '',
        'sentence': ''
    } for p in re.findall('(.*?)[::,,;;]$', role_paragraph, re.M)]