예제 #1
0
def get_sub_content(link, f:'_io.TextIOWrapper'):
    """Fetch the detail page for *link* and append its fields to *f* as one line.

    The page at ``"http://" + baseurl + link`` is downloaded with
    ``get_content`` and parsed; inside the element with class ``CardView``
    the following cell texts are collected, each followed by ``';'``:

    * from the first two tables: every second ``<td>``, starting at index 1
      (presumably the value cells of label/value pairs -- confirm against
      the page markup);
    * from the third table: the ``<td>`` at index 4.

    Tabs, line feeds and carriage returns are stripped, the text is passed
    through ``normalUtf``, and the result plus a newline is written to *f*,
    which is flushed immediately.
    """
    # Only the second element of the returned pair (the page body) is used.
    _, page = get_content("http://" + baseurl + link)
    card = BeautifulSoup(page, from_encoding="utf-8").find(class_='CardView')
    tables = card.find_all('table')

    pieces = []
    for tbl in tables[:2]:
        cells = tbl.find_all('td')
        pieces.extend(cell.get_text() + ';' for cell in cells[1::2])
    pieces.append(tables[2].find_all('td')[4].get_text() + ';')

    line = ''.join(pieces)
    # Drop whitespace control characters so the record stays on one line.
    for unwanted in ('\t', '\n', '\x0D'):
        line = line.replace(unwanted, '')

    f.write(normalUtf(line) + '\n')
    f.flush()
예제 #2
0
def inf_from_table(table:'BeautifulSoup',f:'_io.TextIOWrapper'):
    """Write one ';'-separated record to *f* for every result row in *table*.

    Rows are the ``<tr>`` elements with class ``RowsTable_Default`` followed
    by those with class ``RowsTable_Default_``.  For each row the text of
    cells 1..8 is written (notice number, subject, price, organizer,
    publication date/time, start date, start time, state), then the href of
    the ``<a>`` found in cell 0.  ``get_sub_content`` is then called with
    that href and appends the detail-page fields and the terminating newline
    to the same line.

    A row whose first cell contains no ``<a>`` is written without a URL and
    its line is terminated here; no detail page is fetched for it.

    Returns the href of the first row that has a link, or ``None`` when no
    row has one.

    Raises ``IndexError`` if a row has fewer than nine ``<td>`` cells.
    """
    rows = (table.find_all('tr', class_='RowsTable_Default')
            + table.find_all('tr', class_='RowsTable_Default_'))

    first_href = None
    for row in rows:
        cells = row.find_all('td')

        # Explicit indexing (not a slice) so a malformed short row still
        # fails loudly instead of silently producing a truncated record.
        record = ''.join(cells[i].get_text() + ';' for i in range(1, 9))

        link = cells[0].find('a')
        href = None if link is None else link['href']
        if href is not None:
            record += href.replace('\n', '') + ';'

        # Flush before the detail request so the row data is on disk even
        # if fetching the detail page fails.
        f.write(normalUtf(record))
        f.flush()

        if href is None:
            # No detail page to fetch: close the record ourselves, since
            # get_sub_content is what normally writes the newline.
            f.write('\n')
            f.flush()
            continue

        # Append the full information about the order to the same line.
        get_sub_content(href, f)

        if first_href is None:
            first_href = href

    return first_href
            # NOTE(review): fragment -- the enclosing loop/function header and the
            # definitions of soup, inf_file, cur_page and subInfo are not visible
            # here; presumably this is the body of a per-page loop (confirm).
            # Locate the results table by its exact class string (note the trailing space).
            main_table=soup.find("table",class_="tbl_org tbl_org_zakon tbl_org_regedit tbl_torgs ")
            lines=main_table.find_all("tr")
            # Start at index 1, so lines[0] is never processed
            # (presumably the header row -- confirm against the page markup).
            g=1
            num_ln=len(lines)
            while g < num_ln:
                        # _str accumulates the ';'-separated record for this row.
                        _str='';
                        cur_line=lines[g]
                        # Begin: get information about the competition.
                        # The row's onclick attribute is split on single quotes and the
                        # second piece is taken as the URL passed to subInfo.
                        link = cur_line['onclick']
                        url_with_full_inf = link.split("'")[1]
                        # subInfo's return value is placed at the start of the record.
                        _str+=subInfo(url_with_full_inf)
                        # End: get information about the competition.

                        data_in_line=cur_line.find_all("td")
                        #<customer>
                        _str=_str+data_in_line[0].get_text()+';'
                        #</customer>
                        #<subject>
                        _str=_str+data_in_line[1].get_text()+';'
                        #</subject>
                        #<date>
                        _str=_str+data_in_line[2].get_text()+';'
                        #</date>
                        g=g+1
                        # Pass the record through normalUtf, then write it as one line.
                        _str = normalUtf(_str)
                        inf_file.write(_str)
                        inf_file.write("\n")
            # Flush once after all rows of this table have been written,
            # then advance the page counter.
            inf_file.flush()
            cur_page=cur_page+1
# Close the file object that the records were written to.
inf_file.close()