Exemplo n.º 1
0
def save_rollbook(mid, title, c1):
    """Parse attendance markup for one meeting and store it in MongoDB.

    The markup is cut into sections at every ``attend_div`` marker, and each
    section is cut again at the party ``<span>`` header.  Every ``<a>`` link
    inside a party chunk yields one row (member id taken from the link's
    ``href``, member name taken from the link text).

    Args:
        mid: Meeting identifier; written to every row and, converted with
            ``int``, used as the filter passed to ``delete_and_insert``.
        title: Value written to the ``회차`` column of every row.
        c1: Markup object; only ``str(c1)`` is used.

    Side effects:
        Calls ``delete_and_insert`` on ``assembly.watch_main_meeting_rollbook``
        with the parsed rows.
    """
    # Rows are collected in a list and joined once instead of repeated `+=`.
    rows = ["mid\t회차\tpid\t이름\t정당\t출석여부\n"]

    # The text before the first marker carries no attendance data.
    for section in str(c1).split("attend_div")[1:]:
        section_soup = bs(section, "html.parser")
        # The attendance label sits two elements before the first <td>.
        attend_raw = section_soup.find("td").previous_element.previous_element
        attend_type = re.search(r"\w+", attend_raw)[0]

        party_chunks = str(section_soup).split(
            '<span style="color: Array; font-size: 110%;">')
        for party_chunk in party_chunks[1:]:
            party_soup = bs(party_chunk, "html.parser")
            party_name = re.search(r"\w+", party_soup.find("strong").string)[0]
            for link in party_soup.find_all("a"):
                p_id = re.search(r"\d+", link['href'])[0]
                p_name = link.string.strip()
                rows.append(
                    f"{mid}\t{title}\t{p_id}\t{p_name}\t{party_name}\t{attend_type}\n")

    # Round-trip through read_csv so column dtypes are inferred as before.
    body_df = pd.read_csv(StringIO("".join(rows)), sep="\t")
    # 'records' is the full orient name; the abbreviated 'record' is rejected
    # by pandas >= 2.0.
    docs = body_df.to_dict(orient='records')
    with MyMongo() as db:
        db.delete_and_insert('assembly', 'watch_main_meeting_rollbook', docs, {'mid': int(mid)})
Exemplo n.º 2
0
def load_burial_set_geocode(collection):
    """Geocode documents of ``burial.<collection>`` that have no ``lat`` field.

    For each such document the address (``주소``) and facility name (``시설명``)
    are passed to ``get_geocode_from_address``.  On success the document gets
    ``lat``/``lng`` plus ``filter0``..``filter2`` (the three region-depth
    names) and is written back in batches of 20 ``UpdateOne`` operations.

    Args:
        collection: Name of the collection inside the ``burial`` database.
    """
    batch_size = 20

    with MyMongo() as db:
        burial = list(
            db.find('burial', collection, {'lat': {'$exists': False}}))

    def flush(pending):
        # An empty batch is skipped: the original called bulk_write even with
        # nothing queued (presumably PyMongo, which rejects an empty request
        # list -- verify against get_table_obj's return type).
        if not pending:
            return
        with MyMongo() as db:
            obj = db.get_table_obj('burial', collection)
            result = obj.bulk_write(pending)
            print_bulk_result(result)
            pending.clear()

    queries = []
    for doc in burial:
        address = get_geocode_from_address(doc['주소'], None, doc['시설명'])
        # A string result is treated as "lookup failed".
        if isinstance(address, str):
            print('Address Not Found.')
            print(doc['주소'], doc['시설명'])
            continue

        doc['lat'] = address['y']
        doc['lng'] = address['x']
        doc['filter0'] = address.get('region_1depth_name')
        doc['filter1'] = address.get('region_2depth_name')
        doc['filter2'] = address.get('region_3depth_name')

        queries.append(
            UpdateOne({'_id': doc['_id']}, {'$set': doc}, upsert=True))

        if len(queries) >= batch_size:
            flush(queries)

    # Write whatever is left over from the last partial batch.
    flush(queries)
Exemplo n.º 3
0
def load_market_set_geocode(schema, collection):
    """Geocode documents of ``<schema>.<collection>`` whose ``위도`` is NaN.

    For each such document the road-name address (``소재지도로명주소``) and market
    name (``시장명``), both converted to ``str``, are passed to
    ``get_geocode_from_address``.  On success ``위도``/``경도`` are set from the
    result's ``y``/``x`` and ``filter0``..``filter2`` from its region-depth
    names; updates are written back in batches of 20 ``UpdateOne`` operations.

    Args:
        schema: Database name.
        collection: Collection name inside ``schema``.
    """
    batch_size = 20

    with MyMongo() as db:
        docs = list(db.find(schema, collection, {'위도': np.nan}))

    def flush(pending):
        # An empty batch is skipped: the original called bulk_write even with
        # nothing queued (presumably PyMongo, which rejects an empty request
        # list -- verify against get_table_obj's return type).
        if not pending:
            return
        with MyMongo() as db:
            obj = db.get_table_obj(schema, collection)
            result = obj.bulk_write(pending)
            print_bulk_result(result)
            pending.clear()

    queries = []
    for doc in docs:
        address = get_geocode_from_address(str(doc['소재지도로명주소']), None,
                                           str(doc['시장명']))
        # A string result is treated as "lookup failed".
        if isinstance(address, str):
            print('Address Not Found.')
            print(doc['소재지도로명주소'], doc['시장명'])
            continue

        doc['위도'] = address['y']
        doc['경도'] = address['x']
        doc['filter0'] = address.get('region_1depth_name')
        doc['filter1'] = address.get('region_2depth_name')
        doc['filter2'] = address.get('region_3depth_name')

        queries.append(
            UpdateOne({'_id': doc['_id']}, {'$set': doc}, upsert=True))

        if len(queries) >= batch_size:
            flush(queries)

    # Write whatever is left over from the last partial batch.
    flush(queries)
Exemplo n.º 4
0
from lib.scrape import get_soup_from_url
from lib.util import print_bulk_result

# Read the total member count shown on the search-result page.
url_total = 'http://watch.peoplepower21.org/?act=&mid=AssemblyMembers&vid=&mode=search&name=&party=&region=&sangim=&gender=&age=&elect_num=&singlebutton='
soup = get_soup_from_url(url_total)
text = soup.get_text()
# The count may carry thousands separators ("1,234"); the pattern accepts
# them so that the comma stripping below can actually take effect.
re_obj = re.search(r'총\s(\d[\d,]*)명', text)
if re_obj is None:
    # Fail with a clear message instead of AttributeError on None.
    raise ValueError('count_total_from_web not found in page text')
count_total_from_web = int(re_obj.group(1).replace(',', ''))

if not count_total_from_web:
    raise ValueError('count_total_from_web = 0')

# Collect the member sequence numbers already stored in the database.
with MyMongo() as db:
    print('Get row count from db.')
    member_table = db.get_df_from_table('assembly', 'watch_member')

    try:
        seq_list_from_db = set(member_table['seq'].tolist())
    except KeyError:
        # No 'seq' column: treated as "no members stored yet".
        print('No Member Found from assembly/watch_member')
        seq_list_from_db = set()
    else:
        print(seq_list_from_db)

# Page count for the listing.
# NOTE(review): `// n + 1` is one more than the ceiling when the total is an
# exact multiple of the page size; kept as is because the consumer of
# count_total_page is not visible here -- confirm how it is used.
count_num_of_records = 30
count_total_page = count_total_from_web // count_num_of_records + 1
Exemplo n.º 5
0
def ETL_quotes(file_name, collection_name,
               data_dir='/Users/jake/OneDrive - leverage innovative users/Documents/News_Item/Audit_quote/'):
    """Tag each quote row with the legislator it is attributed to and store it.

    Steps:
      1. Read the tab-separated file ``file_name`` from ``data_dir`` and add a
         ``url`` column built from the ``뉴스 식별자`` column.
      2. Apply ``extract_name`` to ``정보원`` and look for a known member name
         in the result: first as written, then with a space before the last
         syllable, then with a space after the first syllable.
      3. For rows whose extracted value is a single character, fetch the
         article's detail JSON, keep the people accepted by ``filter_refer``
         for that character, and use the result only when exactly one of them
         is a known member.
      4. Rename the result column to ``국회의원`` and pass the rows with a
         non-empty value to ``delete_and_insert_df`` on
         ``audit_quote.<collection_name>``.

    Indices of rows whose request failed, and the ``정보원`` values of rows left
    unmatched, are printed at the end.

    Args:
        file_name: Name of the TSV file inside ``data_dir``.
        collection_name: Target collection in the ``audit_quote`` database.
        data_dir: Directory holding the input file; defaults to the path that
            was previously hard-coded.
    """
    quote_file = os.path.join(data_dir, file_name)
    quote = pd.read_csv(quote_file, sep='\t', dtype=object)
    quote['url'] ="https://www.bigkinds.or.kr/news/detailView.do?docId="+quote['뉴스 식별자']+"&returnCnt=1&sectionDiv=1000&indexName="
    members = ['강길부', '강병원', '강석진', '강석호', '강창일', '강효상', '강훈식', '경대수', '고용진', '곽대훈', '곽상도', '권미혁', '권성동', '권은희', '권칠승', '금태섭', '기동민', '김경진', '김경협', '김관영', '김광림', '김광수', '김규환', '김기선', '김도읍', '김동철', '김두관', '김명연', '김무성', '김민기', '김병관', '김병기', '김병욱', '김부겸', '김삼화', '김상훈', '김상희', '김석기', '김선동', '김성수', '김성식', '김성원', '김성찬', '김성태', '김성태', '김성환', '김세연', '김수민', '김순례', '김승희', '김영우', '김영주', '김영진', '김영춘', '김영호', '김용태', '김재경', '김재원', '김정우', '김정재', '김정호', '김정훈', '김종대', '김종민', '김종석', '김종회', '김종훈', '김중로', '김진태', '김진표', '김철민', '김태년', '김태흠', '김학용', '김한정', '김한표', '김해영', '김현권', '김현미', '김현아', '나경원', '남인순', '노웅래', '도종환', '맹성규', '문진국', '문희상', '민경욱', '민병두', '민홍철', '박경미', '박광온', '박대출', '박덕흠', '박맹우', '박명재', '박범계', '박병석', '박선숙', '박성중', '박순자', '박영선', '박완수', '박완주', '박용진', '박인숙', '박재호', '박정', '박주민', '박주선', '박주현', '박지원', '박찬대', '박홍근', '백승주', '백재현', '백혜련', '변재일', '서삼석', '서영교', '서청원', '서형수', '설훈', '성일종', '소병훈', '손금주', '손혜원', '송갑석', '송기헌', '송석준', '송언석', '송영길', '송옥주', '송희경', '신경민', '신동근', '신보라', '신상진', '신용현', '신창현', '심기준', '심상정', '심재권', '심재철', '안규백', '안민석', '안상수', '안호영', '어기구', '엄용수', '여상규', '염동열', '오신환', '오영훈', '오제세', '우상호', '우원식', '원유철', '원혜영', '위성곤', '유기준', '유동수', '유민봉', '유성엽', '유승민', '유승희', '유은혜', '유의동', '유재중', '윤관석', '윤상직', '윤상현', '윤소하', '윤영석', '윤영일', '윤일규', '윤재옥', '윤종필', '윤준호', '윤한홍', '윤호중', '윤후덕', '이개호', '이군현', '이규희', '이동섭', '이만희', '이명수', '이상돈', '이상민', '이상헌', '이석현', '이수혁', '이양수', '이언주', '이완영', '이용득', '이용주', '이용호', '이우현', '이원욱', '이은권', '이은재', '이인영', '이장우', '이재정', '이정미', '이정현', '이종걸', '이종구', '이종명', '이종배', '이주영', '이진복', '이찬열', '이채익', '이철규', '이철희', '이춘석', '이태규', '이학영', '이학재', '이해찬', '이헌승', '이현재', '이혜훈', '이후삼', '이훈', '인재근', '임이자', '임재훈', '임종성', '장병완', '장석춘', '장정숙', '장제원', '전재수', '전해철', '전현희', '전혜숙', '전희경', '정갑윤', '정동영', '정병국', '정성호', '정세균', '정양석', '정용기', '정우택', '정운천', '정유섭', '정인화', '정재호', '정종섭', '정진석', '정춘숙', '정태옥', '제윤경', '조경태', '조배숙', '조승래', '조원진', '조응천', '조정식', '조훈현', '주광덕', '주승용', '주호영', '지상욱', '진선미', '진영', '채이배', '천정배', '최경환', '최경환', '최교일', '최도자', '최연혜', '최운열', '최인호', '최재성', '추경호', '추미애', '추혜선', 
'표창원', '하태경', '한선교', '한정애', '함진규', '홍문종', '홍문표', '홍영표', '홍의락', '홍익표', '홍일표', '홍철호', '황영철', '황주홍', '황희']
    member_set = set(members)

    # (spelling to search for, canonical member name) pairs, in the original
    # search order: every plain name first, then every "AB C" spelling, then
    # every "A BC" spelling.  Names that are not three syllables long have no
    # spaced spellings.
    three_syllable = [m for m in members if len(m) == 3]
    candidates = [(m, m) for m in members]
    candidates += [(m[0:2] + ' ' + m[2], m) for m in three_syllable]
    candidates += [(m[0:1] + ' ' + m[1:3], m) for m in three_syllable]

    quote['정보원_추출'] = quote['정보원'].apply(extract_name)
    # Create the result column up front so it exists even when nothing matches.
    quote['정보원_추출2'] = ''

    for i, row in quote.iterrows():
        value = row['정보원_추출']
        # Record the canonical name of the first spelling found.  The original
        # reused a stale loop variable for the spaced spellings and therefore
        # stored the wrong member.
        matched = next(
            (name for spelling, name in candidates if spelling in value), None)
        if matched is not None:
            quote.at[i, '정보원_추출2'] = matched

    idx_len_1 = quote['정보원_추출'].str.len() == 1
    requests_error = []

    for i, row in quote.loc[idx_len_1, ['뉴스 식별자', '정보원_추출']].iterrows():
        news_id = row['뉴스 식별자']
        # Rows were selected by length == 1, so index 0 always exists.
        first_character = row['정보원_추출'][0]
        url = f'https://www.bigkinds.or.kr/news/detailView.do?docId={news_id}&returnCnt=1&sectionDiv=1000&indexName='

        try:
            # NOTE(review): verify=False disables TLS certificate checking;
            # kept from the original -- confirm this is still required.
            response = requests.get(url, timeout=3, verify=False)
        except (ReadTimeout, ContentDecodingError, TooManyRedirects):
            requests_error.append(i)
            continue

        # A 4xx answer is skipped instead of being parsed as a detail payload;
        # the row keeps its empty value.
        if str(response.status_code)[0] == '4':
            continue
        people = response.json()['detail']['TMS_NE_PERSON']

        people_list = [
            p for p in people.split('\n') if filter_refer(p, first_character)]
        people_only_legislator = [p for p in people_list if p in member_set]

        # Only an unambiguous single candidate is accepted.
        if len(people_only_legislator) == 1:
            result = people_only_legislator[0]
        else:
            result = ''

        quote.at[i, '정보원_추출2'] = result

    idx_not_empty = quote['정보원_추출2'] != ''
    idx_empty = ~idx_not_empty
    quote = quote.rename(columns={'정보원_추출2': '국회의원'})

    with MyMongo() as db:
        db.delete_and_insert_df('audit_quote', collection_name, quote.loc[idx_not_empty])

    print(requests_error)
    print(quote.loc[idx_empty]['정보원'].tolist())