Example #1
    def __init__(self, config):
        """Weibo类初始化"""
        self.config = config
        # change cookie from string to dict
        if type(self.config['cookie']) == type(u''):
            self.config['cookie'] = {
                t.strip().split("=")[0]: t.strip().split("=")[1]
                for t in self.config['cookie'].split(";")
            }
        if type(self.config['user_id_list']) == type(u""):
            user_id_list = self.config['user_id_list']
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            self.config['user_id_list'] = user_id_list
            with open(self.config['user_id_list'], 'rb') as f:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8') for line in lines]
                self.config['user_id_list'] = [
                    line.split(' ')[0] for line in lines if
                    len(line.split(' ')) > 0 and line.split(' ')[0].isdigit()
                ]
        if type(self.config['since_date']) == type(0):
            self.config['since_date'] = str(
                date.today() - timedelta(self.config['since_date']))

        self.validator = Validator(self.config)
        self.validator.validate()
        self.printer = Printer()
        self.writer = Writer(self.config)
        self.downloader = Downloader(self.config)
        self.parser = Parser(self.config)
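The constructor above only shows how the config is normalized. The sketch below is a hypothetical minimal config for it: the key names are taken from the code, but every value is a placeholder assumption.

# Hypothetical config for the constructor above; all values are placeholders.
config = {
    'cookie': 'SUB=xxx; SUBP=yyy',        # split on ';' and '=' into a dict
    'user_id_list': 'user_id_list.txt',   # relative path, resolved next to this file
    'since_date': 30,                     # int: becomes str(date.today() - timedelta(30))
    'filter': 0,                          # 0 = all weibo, 1 = original only (see Example #5)
    'pic_download': 0,
    'video_download': 0,
}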
Example #2
    def __init__(self, driver, street_number, street_name):
        # Go to SDAT property search website
        driver.navigate_to_website(
            "http://sdat.dat.maryland.gov/RealProperty/Pages/default.aspx")

        # counties include BALTIMORE CITY, BALTIMORE COUNTY, etc
        driver.select_search_param("BALTIMORE CITY")

        # find property
        driver.search_property(street_number, street_name)

        # scrape html
        html_string = driver.page_source
        parser = Parser(html_string)

        # hash for finding data
        ids_to_search = {
            'owner_name': 'lblOwnerName_0',
            'use': 'lblUse_0',
            'principal_residence': 'lblPrinResidence_0',
            'mailing_address': 'lblMailingAddress_0',
            'deed_reference': 'lblDedRef_0',
            'premises_address': 'lblPremisesAddress_0',
            'legal_description': 'lblLegalDescription_0',
            'primary_structure_built': 'Label18_0',
            'above_grade_living_area': 'Label19_0',
            'finished_basement_area': 'Label27_0',
            'property_land_area': 'Label20_0',
            'stories': 'Label22_0',
            'basement': 'Label23_0',
            'bldg_type': 'Label24_0',
            'exterior': 'Label25_0',
            'full_half_bath': 'Label34_0',
            'garage': 'Label35_0',
            'last_major_reno': 'Label36_0',
            'last_deed_transfer_seller': 'Label38_0',
            'last_deed_transfer_date': 'Label39_0',
            'last_deed_transfer_price': 'Label40_0',
            'last_deed_transfer_type': 'Label41_0'
        }

        data_dict = {}

        for key, val in ids_to_search.items():
            data = parser.strip_data_from_html(val)
            data_dict[key] = data

        self.property_data = data_dict
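Parser.strip_data_from_html is not shown in this example. A minimal sketch of what such an id-based lookup could look like, using BeautifulSoup, is below; it is an assumption for illustration, not the project's actual parser.

from bs4 import BeautifulSoup

class Parser:
    """Hypothetical sketch: pulls the text of an element out of an HTML page by its id."""

    def __init__(self, html_string):
        self.soup = BeautifulSoup(html_string, 'html.parser')

    def strip_data_from_html(self, element_id):
        element = self.soup.find(id=element_id)
        return element.get_text(strip=True) if element else None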
Example #3
    def scrap_one(self, region):
        res = []
        i = regions.index(region) + 2
        self.params['CourtRegion[]'] = i
        __raw_html = post_request(self.url, self.params)
        # with open('html_dump', 'w') as f:
        #     f.write(__raw_html)
        p = Parser()
        elems = p.get_tds(__raw_html)
        del elems[0]
        for e in elems:
            res.append(CourtCase().from_dict(to_dict(e)))
        dump_dir = os.path.realpath('dump')
        with open(os.path.join(dump_dir, 'dump_' + region), 'w') as f:
            for e in res:
                f.write(str(e) + '\n')
        return res
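post_request and to_dict come from elsewhere in the project. A minimal sketch of a post_request helper follows, assuming it simply POSTs the form parameters and returns the response body; the timeout is an added assumption.

import requests

def post_request(url, params, timeout=30):
    """Hypothetical helper: POST form data and return the response HTML."""
    response = requests.post(url, data=params, timeout=timeout)
    response.raise_for_status()
    return response.text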
Example #4
    def get_article(self, url):

        # initialize the UrlHandler
        urlhandler = UrlHandler()
        # fetch the web page and its encoding
        source_page, encoding = urlhandler.load_page(url)

        # initialize the parser, text processor, and extractor
        html_parser = Parser(source_page, encoding)
        text_processor = TextProcessor(self.language)
        article_extractor = ArticleExtractor(self.language)
        formatter = Formatter()

        # get the list of elements stripped of tags (raw_cleaned_elements)
        # and the untouched ones (elements_as_string)
        raw_cleaned_elements, elements_as_string = html_parser.get_parsed_nodes()
        # page title
        title = html_parser.get_title()

        # get the list of stemmed texts
        stemmed_tag_elements = text_processor.iterate_over_texts(
            raw_cleaned_elements)
        # get the ranked list of elements
        best_nodes = article_extractor.find_best_node(stemmed_tag_elements)

        # for the top element of the ranked list, find the matching
        # tagged element (elements_as_string) in a loop
        # and pass the found element to the formatter
        for text, element in zip(raw_cleaned_elements, elements_as_string):
            if best_nodes[0][0] == text:
                node_to_format = element

        # the formatter prepares the text for saving
        clean_text = formatter.format_article(node_to_format)

        # save to a text file
        with codecs.open('output.txt', 'w', 'utf-8') as out:
            out.write(title + '\n\n')
            for paragraph in clean_text:
                for line in paragraph:
                    out.write(line)
                    out.write('\n')
                out.write('\n')
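ArticleExtractor.find_best_node is not shown either; judging by the best_nodes[0][0] indexing above, it returns (text, score) pairs sorted best-first. Below is a purely hypothetical sketch of such a ranking, with the scoring rule (token count) an assumption.

def find_best_node(stemmed_tag_elements):
    """Hypothetical ranking: score each candidate text by its token count
    and return (text, score) pairs, best first."""
    scored = [(text, len(text.split())) for text in stemmed_tag_elements]
    return sorted(scored, key=lambda pair: pair[1], reverse=True)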
Example #5
class Spider(object):
    def __init__(self, config):
        """Weibo类初始化"""
        self.config = config
        # change cookie from string to dict
        if type(self.config['cookie']) == type(u''):
            self.config['cookie'] = {
                t.strip().split("=")[0]: t.strip().split("=")[1]
                for t in self.config['cookie'].split(";")
            }
        if type(self.config['user_id_list']) == type(u""):
            user_id_list = self.config['user_id_list']
            if not os.path.isabs(user_id_list):
                user_id_list = os.path.split(
                    os.path.realpath(__file__))[0] + os.sep + user_id_list
            self.config['user_id_list'] = user_id_list
            with open(self.config['user_id_list'], 'rb') as f:
                lines = f.read().splitlines()
                lines = [line.decode('utf-8') for line in lines]
                self.config['user_id_list'] = [
                    line.split(' ')[0] for line in lines if
                    len(line.split(' ')) > 0 and line.split(' ')[0].isdigit()
                ]
        if type(self.config['since_date']) == type(0):
            self.config['since_date'] = str(
                date.today() - timedelta(self.config['since_date']))

        self.validator = Validator(self.config)
        self.validator.validate()
        self.printer = Printer()
        self.writer = Writer(self.config)
        self.downloader = Downloader(self.config)
        self.parser = Parser(self.config)

    def get_nickname(self):
        """获取用户昵称"""
        url = 'https://weibo.cn/%s/info' % (self.user['id'])
        selector = self.parser.deal_html(url, self.config['cookie'])
        nickname = selector.xpath('//title/text()')[0]
        nickname = nickname[:-3]
        if nickname == u'登录 - 新' or nickname == u'新浪':
            write_log(self.config['since_date'])
            sys.exit(u'Cookie is invalid or expired; please obtain a new one following the README')
        self.user['nickname'] = nickname

    def get_user_info(self, selector):
        """获取用户昵称、微博数、关注数、粉丝数"""
        self.get_nickname()  # 获取用户昵称
        user_info = selector.xpath("//div[@class='tip2']/*/text()")

        self.user['weibo_num'] = int(user_info[0][3:-1])
        self.user['following'] = int(user_info[1][3:-1])
        self.user['followers'] = int(user_info[2][3:-1])
        self.printer.print_user_info(self.user)
        self.writer.write_user(self.user)
        print('*' * 100)

    def get_one_page(self, page):
        """获取第page页的全部微博"""
        url = 'https://weibo.cn/u/%s?page=%d' % (self.user['id'], page)
        selector = self.parser.deal_html(url, self.config['cookie'])
        info = selector.xpath("//div[@class='c']")
        is_exist = info[0].xpath("div/span[@class='ctt']")
        if is_exist:
            for i in range(0, len(info) - 2):
                weibo = self.parser.get_one_weibo(info[i])
                if weibo:
                    if weibo['id'] in self.weibo_id_list:
                        continue
                    publish_time = datetime.strptime(
                        weibo['publish_time'][:10], "%Y-%m-%d")
                    since_date = datetime.strptime(self.config['since_date'],
                                                   "%Y-%m-%d")
                    if publish_time < since_date:
                        if self.parser.is_pinned_weibo(info[i]):
                            continue
                        else:
                            return True
                    self.printer.print_one_weibo(weibo)

                    self.weibo.append(weibo)
                    self.weibo_id_list.append(weibo['id'])
                    self.got_num += 1
                    print('-' * 100)

                    self.writer.write_weibo([weibo])

    def get_weibo_info(self):
        """获取微博信息"""
        url = 'https://weibo.cn/u/%s' % (self.user['id'])
        selector = self.parser.deal_html(url, self.config['cookie'])
        self.get_user_info(selector)  # fetch nickname, weibo count, following and follower counts

        page_num = self.parser.get_page_num(selector)  # total number of weibo pages
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_num + 1), desc='Progress'):
            is_end = self.get_one_page(page)  # fetch all weibo posts on this page
            if is_end:
                break

            # Add a random wait to avoid being rate-limited. Crawling too fast
            # can trigger a temporary block (it lifts automatically after a while);
            # a random wait mimics human behavior and lowers that risk. By default
            # the crawler sleeps 6-10 seconds after every 1-5 pages; if you still
            # get blocked, increase the sleep time.
            if page - page1 == random_pages and page < page_num:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)

        if not self.config['filter']:
            print(u'Fetched ' + str(self.got_num) + u' weibo posts in total')
        else:
            print(u'Fetched ' + str(self.got_num) + u' original weibo posts in total')

    def initialize_info(self, user_id):
        """初始化爬虫信息"""
        self.got_num = 0  # 爬取到的微博数
        self.weibo = []  # 存储爬取到的所有微博信息
        self.user = {'id': user_id}  # 存储爬取到的用户信息
        self.weibo_id_list = []  # 存储爬取到的所有微博id

    def start(self):
        """运行爬虫"""
        for user_id in self.config['user_id_list']:
            self.initialize_info(user_id)
            print('*' * 100)
            self.get_weibo_info()
            print(u'Finished fetching information')
            print('*' * 100)
            if self.config['pic_download'] == 1:
                file_path = get_filepath('img', self.user['nickname'])
                self.downloader.download_files(file_path, 'img', self.weibo)
            if self.config['video_download'] == 1:
                file_path = get_filepath('video', self.user['nickname'])
                self.downloader.download_files(file_path, 'video', self.weibo)
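Putting Example #5 together, a hedged usage sketch: the config values are placeholders and the user id is made up.

if __name__ == '__main__':
    config = {
        'cookie': 'SUB=xxx; SUBP=yyy',      # placeholder Weibo cookie string
        'user_id_list': ['1669879400'],     # placeholder user id
        'since_date': 30,
        'filter': 0,
        'pic_download': 0,
        'video_download': 0,
    }
    spider = Spider(config)
    spider.start()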
Example #6
File: main.py  Project: n0mercy071/tensor
        with open(path, 'w', encoding='utf-8') as file:
            file.write(text)

        print('Done.\nPath: ' + path)


def create_argparser():
    '''Creates a command-line argument parser.'''
    parser = argparse.ArgumentParser()
    parser.add_argument('url', nargs='?')

    return parser


if __name__ == '__main__':
    parser = Parser()
    argparser = create_argparser()
    namespace = argparser.parse_args()

    if namespace.url:
        save_text(namespace.url)
    else:
        # Runs when no command-line arguments were given
        while True:
            user_input = input('url, or exit to quit: ')
            if user_input == 'exit':
                break
            # Refresh the selector list from the config
            elif user_input == 'refresh':
                parser.refresh_selectors()
                print('Selectors refreshed')
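The fragment at the top of this example starts in the middle of a function. Below is a hypothetical reconstruction of the surrounding save_text, where get_text is an assumed Parser method and the output path is an assumed location.

import os

def save_text(url):
    # Hypothetical reconstruction: `parser` is the module-level Parser()
    # created in __main__, and get_text() is an assumed method name.
    text = parser.get_text(url)
    path = os.path.join(os.getcwd(), 'output.txt')  # assumed output location

    with open(path, 'w', encoding='utf-8') as file:
        file.write(text)

    print('Done.\nPath: ' + path)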
Example #7
from html_parser import Parser

# starting page
url = 'http://google.com/'

# get the list of links via the Parser class
links = Parser.url_request(None, url)

# print the links
print("Links from", url)
for link in links:
    print(" ->", link[0])

    # get the list of second-level links
    links_2 = Parser.url_request(None, link[0])
    for link2 in links_2:
        print("second level ->", link2[0])
Example #8
from html_parser import Parser
from counter import Counter
from database import DataBase

URL = "https://www.google.com/"
HEADERS = {
    'user-agent':
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/88.0.4324.190 '
    'Safari/537.36',
    'accept':
    '*/*'
}
DIVS = " ,:@.-()/!?"

if __name__ == '__main__':
    URL = input("Type your URL: ")
    html_text = Parser(url=URL, headers=HEADERS).parse()
    c = Counter(html_text, divider=DIVS).count_words()
    print(c)
    db = DataBase()
    db.make_database(c)
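Counter and DataBase come from the project's own modules. Below is a minimal sketch of a word counter that splits on the DIVS delimiter characters, using only the standard library; its behavior is an assumption about what Counter.count_words does.

import re
from collections import Counter as StdCounter

def count_words(text, divider=" ,:@.-()/!?"):
    """Hypothetical: split text on any of the divider characters and count the words."""
    tokens = re.split('[' + re.escape(divider) + ']+', text)
    return StdCounter(token.lower() for token in tokens if token)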