def __init__(self, driver, street_number, street_name):
    # Go to the SDAT property search website
    driver.navigate_to_website(
        "http://sdat.dat.maryland.gov/RealProperty/Pages/default.aspx")
    # Counties include BALTIMORE CITY, BALTIMORE COUNTY, etc.
    driver.select_search_param("BALTIMORE CITY")
    # Find the property
    driver.search_property(street_number, street_name)
    # Scrape the resulting HTML
    html_string = driver.page_source
    parser = Parser(html_string)
    # Map of field names to the element ids that hold the data
    ids_to_search = {
        'owner_name': 'lblOwnerName_0',
        'use': 'lblUse_0',
        'principal_residence': 'lblPrinResidence_0',
        'mailing_address': 'lblMailingAddress_0',
        'deed_reference': 'lblDedRef_0',
        'premises_address': 'lblPremisesAddress_0',
        'legal_description': 'lblLegalDescription_0',
        'primary_structure_built': 'Label18_0',
        'above_grade_living_area': 'Label19_0',
        'finished_basement_area': 'Label27_0',
        'property_land_area': 'Label20_0',
        'stories': 'Label22_0',
        'basement': 'Label23_0',
        'bldg_type': 'Label24_0',
        'exterior': 'Label25_0',
        'full_half_bath': 'Label34_0',
        'garage': 'Label35_0',
        'last_major_reno': 'Label36_0',
        'last_deed_transfer_seller': 'Label38_0',
        'last_deed_transfer_date': 'Label39_0',
        'last_deed_transfer_price': 'Label40_0',
        'last_deed_transfer_type': 'Label41_0'
    }
    data_dict = {}
    for key, val in ids_to_search.items():
        data = parser.strip_data_from_html(val)
        data_dict[key] = data
    self.property_data = data_dict
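# A minimal sketch of what Parser.strip_data_from_html could look like.
# The BeautifulSoup-based lookup by id suffix is an assumption for
# illustration, not the project's actual implementation.
from bs4 import BeautifulSoup

class Parser:
    def __init__(self, html_string):
        self.soup = BeautifulSoup(html_string, 'html.parser')

    def strip_data_from_html(self, element_id):
        # ASP.NET prefixes control ids, so match on the id suffix
        node = self.soup.find(id=lambda i: i and i.endswith(element_id))
        return node.get_text(strip=True) if node else None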
def scrap_one(self, region):
    res = []
    # Region indices used by the court site are offset by 2
    i = regions.index(region) + 2
    self.params['CourtRegion[]'] = i
    __raw_html = post_request(self.url, self.params)
    # with open('html_dump', 'w') as f:
    #     f.write(__raw_html)
    p = Parser()
    elems = p.get_tds(__raw_html)
    del elems[0]  # skip the first element
    for e in elems:
        res.append(CourtCase().from_dict(to_dict(e)))
    dump_dir = os.path.realpath('dump')
    with open(dump_dir + '/dump_' + region, 'w') as f:
        for e in res:
            f.write(str(e) + '\n')
    return res
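# A minimal sketch of the post_request helper used above, assuming it simply
# wraps requests.post and returns the decoded body; the real helper may add
# headers, retries, or custom encoding handling.
import requests

def post_request(url, params):
    # Send the search form and return the HTML as text
    response = requests.post(url, data=params)
    response.raise_for_status()
    return response.text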
def get_article(self, url):
    # Initialize the UrlHandler
    urlhandler = UrlHandler()
    # Fetch the web page and its encoding
    source_page, encoding = urlhandler.load_page(url)
    # Initialize the parser, the text processor and the extractor
    html_parser = Parser(source_page, encoding)
    text_processor = TextProcessor(self.language)
    article_extractor = ArticleExtractor(self.language)
    formatter = Formatter()
    # Get the list of elements stripped of tags (raw_cleaned_elements)
    # and the list that keeps the tags (elements_as_string)
    raw_cleaned_elements, elements_as_string = html_parser.get_parsed_nodes()
    # Page title
    title = html_parser.get_title()
    # Get the list of lemmatized texts
    stemmed_tag_elements = text_processor.iterate_over_texts(
        raw_cleaned_elements)
    # Get the ranked list of elements
    best_nodes = article_extractor.find_best_node(stemmed_tag_elements)
    # For the top-ranked element, look up the matching element that still
    # has its tags (elements_as_string) and pass it to the formatter
    node_to_format = None
    for text, element in zip(raw_cleaned_elements, elements_as_string):
        if best_nodes[0][0] == text:
            node_to_format = element
    # The formatter prepares the text for saving
    clean_text = formatter.format_article(node_to_format)
    # Save to a text file
    with codecs.open('output.txt', 'w', 'utf-8') as out:
        out.write(title + '\n\n')
        for paragraph in clean_text:
            for line in paragraph:
                out.write(line)
                out.write('\n')
            out.write('\n')
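# A hedged sketch of what UrlHandler.load_page might do; using requests and
# its apparent_encoding detection is an assumption about this helper, not a
# description of the project's actual code.
import requests

class UrlHandler:
    def load_page(self, url):
        # Download the page and guess its encoding from the response body
        response = requests.get(url, timeout=10)
        response.raise_for_status()
        encoding = response.encoding or response.apparent_encoding
        return response.text, encoding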
class Spider(object):
    def __init__(self, config):
        """Initialize the Weibo spider"""
        self.config = config
        # Change cookie from string to dict
        if type(self.config['cookie']) == type(u''):
            self.config['cookie'] = {
                t.strip().split("=")[0]: t.strip().split("=")[1]
                for t in self.config['cookie'].split(";")
            }
        # If user_id_list is a string, treat it as a path to a file of user ids
        if type(self.config['user_id_list']) == type(u""):
            user_id_list = self.config['user_id_list']
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            self.config['user_id_list'] = user_id_list
            with open(self.config['user_id_list'], 'rb') as f:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8') for line in lines]
                self.config['user_id_list'] = [
                    line.split(' ')[0] for line in lines
                    if len(line.split(' ')) > 0 and line.split(' ')[0].isdigit()
                ]
        # If since_date is an integer, interpret it as "N days ago"
        if type(self.config['since_date']) == type(0):
            self.config['since_date'] = str(
                date.today() - timedelta(self.config['since_date']))
        self.validator = Validator(self.config)
        self.validator.validate()
        self.printer = Printer()
        self.writer = Writer(self.config)
        self.downloader = Downloader(self.config)
        self.parser = Parser(self.config)

    def get_nickname(self):
        """Get the user's nickname"""
        url = 'https://weibo.cn/%s/info' % (self.user['id'])
        selector = self.parser.deal_html(url, self.config['cookie'])
        nickname = selector.xpath('//title/text()')[0]
        nickname = nickname[:-3]
        if nickname == u'登录 - 新' or nickname == u'新浪':
            write_log(self.config['since_date'])
            sys.exit(u'cookie错误或已过期,请按照README中方法重新获取')
        self.user['nickname'] = nickname

    def get_user_info(self, selector):
        """Get the user's nickname, weibo count, following count and follower count"""
        self.get_nickname()  # Get the user's nickname
        user_info = selector.xpath("//div[@class='tip2']/*/text()")
        self.user['weibo_num'] = int(user_info[0][3:-1])
        self.user['following'] = int(user_info[1][3:-1])
        self.user['followers'] = int(user_info[2][3:-1])
        self.printer.print_user_info(self.user)
        self.writer.write_user(self.user)
        print('*' * 100)

    def get_one_page(self, page):
        """Get all weibo on the given page"""
        url = 'https://weibo.cn/u/%s?page=%d' % (self.user['id'], page)
        selector = self.parser.deal_html(url, self.config['cookie'])
        info = selector.xpath("//div[@class='c']")
        is_exist = info[0].xpath("div/span[@class='ctt']")
        if is_exist:
            for i in range(0, len(info) - 2):
                weibo = self.parser.get_one_weibo(info[i])
                if weibo:
                    if weibo['id'] in self.weibo_id_list:
                        continue
                    publish_time = datetime.strptime(
                        weibo['publish_time'][:10], "%Y-%m-%d")
                    since_date = datetime.strptime(self.config['since_date'],
                                                   "%Y-%m-%d")
                    if publish_time < since_date:
                        if self.parser.is_pinned_weibo(info[i]):
                            continue
                        else:
                            return True
                    self.printer.print_one_weibo(weibo)
                    self.weibo.append(weibo)
                    self.weibo_id_list.append(weibo['id'])
                    self.got_num += 1
                    print('-' * 100)
                    self.writer.write_weibo([weibo])

    def get_weibo_info(self):
        """Get weibo information"""
        url = 'https://weibo.cn/u/%s' % (self.user['id'])
        selector = self.parser.deal_html(url, self.config['cookie'])
        self.get_user_info(selector)  # Get nickname, weibo/following/follower counts
        page_num = self.parser.get_page_num(selector)  # Total number of weibo pages
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc='Progress'):
            is_end = self.get_one_page(page)  # Get all weibo on this page
            if is_end:
                break
            # Random waits are added to avoid being rate-limited. Crawling too
            # fast can trigger a (temporary) block, so pausing like a human
            # lowers that risk. By default the spider sleeps 6 to 10 seconds
            # after every 1 to 5 pages; increase the sleep time if you still
            # get blocked.
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)
        if not self.config['filter']:
            print(u'共爬取' + str(self.got_num) + u'条微博')
        else:
            print(u'共爬取' + str(self.got_num) + u'条原创微博')

    def initialize_info(self, user_id):
        """Initialize the per-user crawl state"""
        self.got_num = 0  # Number of weibo fetched
        self.weibo = []  # All fetched weibo
        self.user = {'id': user_id}  # Fetched user info
        self.weibo_id_list = []  # Ids of all fetched weibo

    def start(self):
        """Run the spider"""
        for user_id in self.config['user_id_list']:
            self.initialize_info(user_id)
            print('*' * 100)
            self.get_weibo_info()
            print(u'信息抓取完毕')
            print('*' * 100)
            if self.config['pic_download'] == 1:
                file_path = get_filepath('img', self.user['nickname'])
                self.downloader.download_files(file_path, 'img', self.weibo)
            if self.config['video_download'] == 1:
                file_path = get_filepath('video', self.user['nickname'])
                self.downloader.download_files(file_path, 'video', self.weibo)
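# A hedged usage sketch for the Spider class above. The config keys come from
# the code itself (cookie, user_id_list, since_date, filter, pic_download,
# video_download); the sample values are placeholders for illustration only.
if __name__ == '__main__':
    config = {
        'cookie': 'SUB=xxx; SUHB=yyy',    # placeholder cookie string
        'user_id_list': ['1669879400'],   # placeholder id, or a path to a file of ids
        'since_date': 30,                 # integer: crawl weibo from the last 30 days
        'filter': 0,                      # 0: all weibo, 1: original weibo only (per the print logic)
        'pic_download': 0,
        'video_download': 0,
    }
    spider = Spider(config)
    spider.start()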
with open(path, 'w', encoding='utf-8') as file:
    file.write(text)
print('Готово.\nПуть: ' + path)


def create_argparser():
    '''Creates the command-line argument parser'''
    parser = argparse.ArgumentParser()
    parser.add_argument('url', nargs='?')
    return parser


if __name__ == '__main__':
    parser = Parser()
    argparser = create_argparser()
    namespace = argparser.parse_args()
    if namespace.url:
        save_text(namespace.url)
    else:
        # Runs when no arguments were given
        while True:
            user_input = input('url или exit для выхода: ')
            if user_input == 'exit':
                break
            # Reloads the selector list from the config
            elif user_input == 'refresh':
                parser.refresh_selectors()
                print('Селекторы обновлены')
            else:
                save_text(user_input)
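# A hedged sketch of the save_text helper referenced in the __main__ block.
# The fragment at the top of this snippet looks like its tail, so how the
# text and output path are produced here (the Parser().parse call and the
# file name) is an assumption for illustration only.
import os

def save_text(url):
    text = Parser().parse(url)  # hypothetical call into the project's Parser
    path = os.path.join(os.getcwd(), 'article.txt')  # hypothetical output path
    with open(path, 'w', encoding='utf-8') as file:
        file.write(text)
    print('Готово.\nПуть: ' + path)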
from html_parser import Parser

# Start page
url = 'http://google.com/'

# Get the list of links via the Parser class
links = Parser.url_request(None, url)

# Print the links
print("Ссылки с ресурса", url)
for link in links:
    print(" ->", link[0])
    # Get the list of second-level links
    links_2 = Parser.url_request(None, link[0])
    for link2 in links_2:
        print("второй уровень ->", link2[0])
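# A hedged sketch of the html_parser.Parser.url_request method assumed above.
# The real class is not shown; this version, which fetches a page and returns
# (href, anchor_text) pairs, is only an illustration consistent with how the
# script indexes each link as link[0]. Because self is unused, calling it as
# Parser.url_request(None, url) works the same way.
import requests
from bs4 import BeautifulSoup

class Parser:
    def url_request(self, url):
        response = requests.get(url, timeout=10)
        soup = BeautifulSoup(response.text, 'html.parser')
        # Collect absolute links together with their anchor text
        return [(a['href'], a.get_text(strip=True))
                for a in soup.find_all('a', href=True)
                if a['href'].startswith('http')]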
from html_parser import Parser
from counter import Counter
from database import DataBase

URL = "https://www.google.com/"
HEADERS = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 '
                  'Safari/537.36',
    'accept': '*/*'
}
# Characters treated as word dividers when counting
DIVS = " ,:@.-()/!?"

if __name__ == '__main__':
    URL = input("Type your URL: ")
    html_text = Parser(url=URL, headers=HEADERS).parse()
    c = Counter(html_text, divider=DIVS).count_words()
    print(c)
    db = DataBase()
    db.make_database(c)
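# A hedged sketch of the counter.Counter behavior assumed above: split the
# page text on every character in the divider string and tally the words.
# The real counter module is not shown, so treat this as illustrative only.
import re
from collections import Counter as StdCounter

class Counter:
    def __init__(self, text, divider):
        self.text = text
        self.divider = divider

    def count_words(self):
        # Build a character class from the divider string and split on it
        pattern = '[' + re.escape(self.divider) + ']+'
        words = [w.lower() for w in re.split(pattern, self.text) if w]
        return dict(StdCounter(words))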