Example #1
    def _render_list_item(self, concept):
        concept.render_object.li = Pq('<li></li>')
        concept.render_object.li.attr('id', '%s-%s' % (concept.id, 'li'))
        concept.render_object.li.append(
            concept.render_object.phr_span.children().remove())
        concept.render_object.phr_span.append(concept.render_object.li)

        if AutoAttributeEngine.is_ordered_list(concept.get_parent(), None,
                                               self.document):
            concept.render_object.render_as_ordered_list = True
        elif AutoAttributeEngine.is_unordered_list(concept.get_parent(), None,
                                                   self.document):
            concept.render_object.render_as_unordered_list = True
Example #2
    def findEachBuilding(self, html):
        doc = Pq(html)
        a_list = doc("a.e_huangse")
        for a in a_list:
            self._apartment_detail["BUILDING_NUM"] = doc(a).text()
            # The detail URL is embedded in the onclick attribute, between the first pair of quotes.
            href = doc(a).attr("onclick")
            href = href[href.index("'") + 1:]
            href = href[:href.index("'")]
            url = self._base_url + href
            # doc_str = self.get_page_content_str(url)
            # self._extract_data(doc_str)
            # time.sleep(1)
            self.save_building(url)
Example #3
    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        self._comcode_detail["province"] = doc('.content>ul>li>h1').text()
        doc = Pq(doc_str)
        tr_list = doc('.content>table>tr')

        for tr in tr_list:
            try:
                # time.sleep(1)
                td_list = doc(tr).find("td")
                self._comcode_detail["city"] = doc(td_list[0]).find("a").text()
                a_list = doc(td_list[1]).find("a")
                for a in a_list:
                    self._comcode_detail["area"] = doc(a).text()
                    url = self._base_url + doc(a).attr("href")
                    # html = self.get_page_content_str(url)
                    # self._extract_data2(html)
                    insert_sql = " INSERT INTO fetch_list2 (source_id, url,times,page,STATUS) VALUE(98,'{}',0,0,0)".format(
                        url)
                    print("insert sql is [" + insert_sql)
                    Dao.execute_dmls(insert_sql)
            except IndexError as er:
                print("error in " + doc(tr).text())
Example #4
def extractKeywords(subpagecfg: dict, keydic: dict, websitedomain: str, name: str,
                    keywordmap: dict, othermap: dict, tagkeyword: dict, filterwords=None):
    url = keydic.get('website')
    if url is not None:
        url = str(url)
        if url.startswith(websitedomain):
            html = SpiderApi.getPageSourceCode(url)
            try:
                contentselector = subpagecfg.get('contentselector')
                keyselector = subpagecfg.get('keyselector1')
                while True:
                    soup = BeautifulSoup(html, 'lxml')
                    SpiderApi.deleteNoise(soup)  # remove noise tags such as style and script
                    allcontent = soup.text
                    allcontent = allcontent.replace('\n', '').replace('\r', '')
                    # Extract the startdate and enddate fields
                    extractDateByRegular(allcontent, othermap, keydic)
                    # Precisely extract the location field
                    preciseExtractLocation(allcontent, othermap, keydic)
                    tablesoup = BeautifulSoup(html, 'lxml')
                    content = soup.select(contentselector)
                    if len(content) > 0:
                        tmpsoup = BeautifulSoup(str(content[0]), 'lxml')
                        content = content[0].get_text()
                        writeToFile(name + '.txt', content)
                        lines = formatReadlines(name + '.txt')
                        removeFile(name + '.txt')
                        # print(content)
                        # print('*  ' * 50)
                        table = tablesoup.select(keyselector)
                        if len(table) > 0:
                            tablehtml = table[0]
                            elements = []
                            for row in tablehtml.children:
                                if not isinstance(row, NavigableString):
                                    rowcontent = str(row.get_text()).replace('\t', '') \
                                        .replace('\r\n', '').replace('\n', '')
                                    elements.append(rowcontent)
                            # for ele in elements:
                            #     print(ele)
                            matchKeywords(elements, websitedomain, keywordmap, othermap, keydic)
                            extractWebsiteField(lines, tmpsoup, websitedomain, keydic)
                            extractTagFiles(tagkeyword, keydic, filterwords)
                        break
                    else:
                        html = getHtmlCore(html)
                        doc = Pq(html)
                        html = str(doc(contentselector))
            except Exception as e:
                print("method extractKeywords exec exception:\n {}".format(traceback.format_exc()))
Example #5
def get_products():
    html = browser.page_source
    doc = Pq(html)
    items = doc('#mainsrp-itemlist .items .item').items()
    for item in items:
        product = {
            'image': item.find('.pic .img').attr('data-src'),
            'price': item.find('.price').text().replace('\n', ''),
            'deal': item.find('.deal-cnt').text(),
            'title': item.find('.title').text().replace('\n', ''),
            'shop': item.find('.shopname').text(),
            'location': item.find('.location').text()
        }
        print(product)
        save_to_mongo(product)
Example #6
    def _extract_data(self, url):
        community_id = self._save_community()
        doc_str = self.get_page_content_str(url)
        doc = Pq(doc_str)
        tr_list = doc("table>tr")
        try:
            for tr in tr_list:
                floor_num = Pq(tr)("td:eq(0)").text()
                a_list = doc(tr).find("td.preview>a")
                for a in a_list:
                    apartment_detail = {
                        'COMMUNITY_ID': community_id,
                        'FLOOR_NUM': floor_num,
                        'APARTMENT_NUM': doc(a).text(),
                        'STATUS': '2',
                        'create_time': time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    }
                    self._save_apartment(apartment_detail)
            sql = "update communities set status = '2' where ORIGINAL_URL = '{}' ; ".format(url)
            Dao.execute_dmls(sql)
        except Exception as e:
            print(e)
            sql = "update communities set status = -1 where ORIGINAL_URL = '{}' ; ".format(url)
            Dao.execute_dmls(sql)
Example #7
def book_from_div(div):
    """
    Extract one book's information from a div.
    """
    e = Pq(div)
    # Single-character names for small-scope variables
    b = Book()
    b.name = e('.title').text()
    b.score = e('.rating_nums').text() or '0'
    b.evaluate = e('.pl').text()
    b.meta = e('.abstract').text()
    b.url = e('.title-text').attr('href')
    # The xmlns attribute may prevent pyquery from parsing the selector below
    b.cover_url = e('.cover').attr('src')
    return b
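
A minimal usage sketch for book_from_div, assuming the list page HTML is already in hand and that '.subject-item' is a hypothetical selector for one book container per div:

def books_from_page(page_html):
    # Parse the page once, then hand each book container to book_from_div.
    e = Pq(page_html)
    divs = e('.subject-item')  # hypothetical selector; adjust to the real page structure
    return [book_from_div(div) for div in divs]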
Example #8
    def find_url_from_ul(self, ul):
        """
        Parse each ul element.
        """
        doc = Pq(ul)
        li_list = doc("li")
        for li in li_list:
            url = self._base_url + doc(li).find("div>p>a").attr("href")
            if url in self.detail_info_urls:
                continue
            else:
                self._merchant_detail["url"] = url
                self.detail_info_urls.append(url)
                html = self.get_page_content_str(url)
                self._extract_data2(html)
Example #9
    def __init__(self, pro, doc, group, organization=None):

        self.project = pro
        self.document = doc
        self.groups = [group.key, Group.get_worldshare().key]
        self.organization = organization

        self.user = User()
        self.user.groups = self.groups

        if organization:
            self.user.organization = organization.key

        self.html = ''
        self.body = Pq('<span></span>')
Example #10
    def _extract_data(self, doc_str, apartment_detail):
        try:
            doc = Pq(doc_str)
            a_list = doc("table>tr>td>table>tr>td")
            # total_item = int(doc("").text().strip())
            # count_num = int(total_item) / 12
            for a in a_list:
                apartment_detail["APARTMENT_NUM"] = doc(a).text()
                if apartment_detail["APARTMENT_NUM"].strip() != '':
                    apartment_detail["create_time"] = time.strftime(
                        '%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
                    self._save_community(apartment_detail)
        except Exception as err:
            print(err)
            time.sleep(1)
            self._extract_data(doc_str, apartment_detail)  # retry
Example #11
    def query(query_url):
        sleep(0.2)
        page_text = Pq(get_html(query_url))
        if 'This name is the accepted name of a species in the genus' in page_text('p:eq(0)').text():
            return latin_name
        elif 'This name is a synonym of' in page_text('p:eq(0)').text():
            sci_name = re.search('This name is a synonym of (.*)', page_text('p:eq(0)').text()).group(1)
            return sci_name
        elif 'The results are below' in page_text('p:eq(0)').text():
            lv2_url = 'http://www.theplantlist.org' + page_text('table>tbody>tr:eq(0)>td:eq(0)>a').attr('href')
            sci_name = page_text('table>tbody>tr:eq(0)>td:eq(0)').text()
            if latin_name in sci_name:
                return query(lv2_url)
            else:
                return 'check tbl manually'
        else:
            return 'check NCBI'
Example #12
    def render(self):
        cur_wc = 0
        concept_count = 0
        processed_concepts = {}

        for concept in self._get_next_concept():
            if concept:
                if not concept.has_permission_read(self.user):
                    continue

                render = True
                if not concept.is_summary_crawlable(document=self.document,
                                                    project=self.project):
                    render = False

                attr = concept.get_attr_by_doc(self.document)
                if attr and attr.is_header():
                    render = False
                if attr and attr.is_image():
                    render = False

                if render:
                    phrase = concept.get_phrasing(doc=self.document,
                                                  return_text=False)
                    wc = phrase.get_word_count()
                    if wc + cur_wc > self.word_count:
                        break
                    concept_count += 1
                    cur_wc += wc

                parent = concept.get_parent()
                if not processed_concepts.get(parent.id):
                    processed_concepts[parent.id] = []
                processed_concepts[parent.id].append(concept)

        paragraph_divider = 300
        # Integer division: number of ~300-word paragraphs, rounded up.
        paragraph_count = cur_wc // paragraph_divider
        if cur_wc % paragraph_divider > 0:
            paragraph_count += 1

        con_pre_par = (concept_count // paragraph_count) + 1
        self.paragraph = Pq('<p></p>')
        self.body.append(self.paragraph)
        self.con_count = 0
        self._render(self.project, con_pre_par, processed_concepts)
        self.html = self.body.html(method='html')
Example #13
File: main.py  Project: kohihi/177manga
def dl(mid: str, save_path: str):
    """
    下载
    :param mid: str 资源的编号
    :param save_path: str 保存文件的目录
    :return:
    """
    url = "http://www.177pic.info/html/" + str(mid) + ".html"
    s = time.time()  # timestamp for measuring elapsed time
    e = query_html(url)
    if e[0] == 1:
        e = Pq(e[1])
        title = e('.entry-title').eq(0)
        if title:
            title = title.text()
        else:
            return "资源不存在,请检查是否输入有误"

        print("标题:{}".format(title))
        match_obj = re.search(r'\[(\d+)P]', title)
        a = "未知"
        if match_obj is not None:
            a = match_obj.group(1)
        if a != "未知":
            counter.reset(int(a))
        page_link_list = e('.page-links').find('a')
        page_num = page_link_list.length

        print("开始下载……")
        save_path = os.path.join(save_path, title)
        if not os.path.exists(os.path.join(save_path)):
            os.makedirs(os.path.join(save_path))
        init_threads(page_num, url, save_path)
        d = time.time()
        print("耗时{}S".format(round(d - s, 4)))

        if setting.zip_mode > 1:
            print(zip_file(save_path))
        if setting.zip_mode > 2:
            import shutil
            shutil.rmtree(save_path)
            print("{} 已删除".format(save_path))
        return "Done"
    else:
        print(e[1])
Example #14
    def crawl_daili66(self, page_count=4):
        """
        Crawl proxies from the 66ip site.
        :param page_count: number of pages to crawl
        :return: proxies as 'ip:port' strings
        """
        start_url = 'http://www.66ip.cn/{}.html'
        urls = [start_url.format(page) for page in range(1, page_count + 1)]
        for url in urls:
            print('Crawling', url)
            html = get_page(url)
            if html:
                doc = Pq(html)
                trs = doc('.containerbox table tr:gt(0)').items()
                for tr in trs:
                    ip = tr.find('td:nth-child(1)').text()
                    port = tr.find('td:nth-child(2)').text()
                    yield ':'.join([ip, port])
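
Since crawl_daili66 is a generator that yields 'ip:port' strings, a caller simply iterates it; a minimal sketch, assuming the method lives on a Crawler class (hypothetical name) that also provides get_page:

crawler = Crawler()  # hypothetical crawler class defining crawl_daili66
for proxy in crawler.crawl_daili66(page_count=2):
    print('got proxy', proxy)  # e.g. '123.45.67.89:8080'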
Example #15
    def __init__(self, pro, doc, wc, group, organization=None):
        self.project = pro
        self.document = doc
        self.word_count = wc
        self.groups = [group.key, Group.get_worldshare().key]
        self.organization = organization

        self.user = User()
        self.user.groups = self.groups
        self.walker = ConceptPublishWalker(pro)

        if organization:
            self.user.organization = organization.key

        self.html = ''
        self.body = Pq('<span></span>')
        self.con_count = 0
        self.paragraph = None
Example #16
    def handle_td(k, td):
        # 0. # employed
        # 1. Code
        # 2. Occupation + link !IMPORTANT
        # 3. Projected growth - as image
        # 4. Projected openings
        td = Pq(td)
        if k == 0:
            data['num_employed'] = td.text()
        if k == 1:
            data['code'] = td.text()
        if k == 2:
            subdata = {'job': td.text(), 'url': td.find('a').attr('href')}
            data['occupation'] = subdata
        if k == 3:
            data['projected_growth'] = td.find('img').attr('alt')
        if k == 4:
            data['projected_openings'] = td.text()
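
handle_td takes the cell index k and the cell node, so it is naturally driven by enumerate over a row's td elements; a minimal sketch, assuming handle_td and a shared data dict live at module scope and using a made-up row that matches the column layout described in the comments:

row_html = ('<table><tr><td>1,200</td><td>15-1131</td>'
            '<td><a href="/ooh/software-developers">Software Developers</a></td>'
            '<td><img alt="Faster than average"></td><td>130</td></tr></table>')
data = {}
for k, td in enumerate(Pq(row_html)('td')):
    handle_td(k, td)
print(data)  # {'num_employed': '1,200', 'code': '15-1131', 'occupation': {...}, ...}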
Example #17
File: csdn.py  Project: Auraxc/csdn-spider
def save_md(page, name):
    e = Pq(page)
    md = e(".blog-content-box .htmledit_views")
    pic = md("img")

    for p in pic.items():
        pic_path = p.attr("src")
        print(pic_path)
        # TODO: download the images and rewrite their links
        ###

    folder = config.md_folder
    create_folder(folder)
    filename = "{}.md".format(name)
    path = os.path.join(folder, filename)
    if not os.path.exists(path):
        with open(path, 'w', encoding="UTF8") as f:
            f.write(str(md))
Example #18
    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        li_list = doc('.aside.aside-left>.category-nav.J-category-nav>li')
        for li in li_list:
            self._category_detail["shopType"] = doc(li).attr("data-key")
            self._category_detail["categoryId"] = self._category_detail[
                "shopType"]
            self._category_detail["name"] = doc(li).find(".name>span").text()
            self._category_list.append(copy.copy(self._category_detail))
            # doc2   = Pq(doc_str)
            # div_list = doc2(".aside.aside-left>.category-nav.J-category-nav>li>.secondary-category.J-secondary-category>div>div")
            a_list = doc(li).find("div>a")
            for a in a_list:
                self._category_detail["categoryId"] = doc(a).attr("data-key")
                self._category_detail["name"] = doc(a).text()
                self._category_list.append(copy.copy(self._category_detail))

        self.save_category()
Example #19
def parse_page(json):
    """页面解析函数"""
    if json:
        items = json.get('data').get('cards')
        # Inspecting the JSON format shows that only the even-indexed elements contain 'mblog',
        # so check whether 'mblog' exists before running the steps below.
        for item in items:
            item = item.get('mblog')
            if item is None:
                pass
            else:
                weibo = {}
                weibo['id'] = item.get('id')
                weibo['text'] = Pq(item.get('text')).text()
                weibo['attitudes'] = item.get('attitudes_count')
                weibo['comments'] = item.get('comments_count')
                weibo['reposts'] = item.get('reposts_count')
                yield weibo
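
parse_page is a generator over one page of parsed JSON, so the caller feeds it the decoded Ajax response and iterates the yielded dicts; a minimal sketch, assuming a hypothetical get_page(page) helper that returns the JSON for one page:

for page in range(1, 3):
    json_data = get_page(page)  # hypothetical helper returning the decoded JSON dict
    for weibo in parse_page(json_data):
        print(weibo['id'], weibo['text'])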
Example #20
    def _extract_data(self, doc_str):
        doc = Pq(doc_str)
        # name
        self.__ne_detail["name"] = doc('.mainTitle >h1').text()
        # area
        for li in doc('.newinfo >ul> li'):
            if doc(li).find(".z").text() == "详细地址:":
                text = doc(li).text()
                text = text.replace("详细地址: ", "").replace("&nbsp", "").replace(
                    "-", "").replace("  ", " ").replace(" ", ",")
                self.__ne_detail["location"] = text
            if doc(li).find(".z").text() == "服务区域:":
                text = doc(li).text()
                text = text.replace("服务区域: ", "").replace("&nbsp", "").replace(
                    "-", "").replace("  ", " ").replace(" ", ",")
                self.__ne_detail["area_name"] = text
        self.__ne_detail["description"] = doc('.description_con >span').text()
        print(self.__ne_detail)
        self._video_dao()
Example #21
def get_latin_name(chinese_name, retry_num=0):
    retry_num += 1
    if retry_num > 3:
        return
    query_url = 'http://www.iplant.cn/info/' + chinese_name
    try:
        a = Pq(get_html(query_url))
        latin_name = a('#sptitlel.infolatin').text()
        if latin_name == '':
            try:
                return re.search('[a-zA-Z\s]+', a('.infomore>a').text()).group(0).strip() + \
                       '\t' + \
                       re.search('[\u4e00-\u9fa5]+', a('.infomore>a').text()).group(0)
            except AttributeError:
                return
        return latin_name
    except TimeoutError:
        sleep(1)
        return get_latin_name(chinese_name, retry_num=retry_num)  # retry with the incremented counter
Example #22
def fanfou_from_div(div):
    """
    Extract one message's information from a div.
    """
    e = Pq(div)
    # Single-character names for small-scope variables
    m = Fanfou()
    # m.name = e('.title').text()
    m.content = e('.content').text()
    m.time = e('.time').attr('stime')
    m.device = e('.method').text()
    m.link = "fanfou.com" + e('.stamp').html().split('"', 2)[1]
    m.pic_link = e('.content a').attr('name')
    if m.pic_link is not None:
        m.pic_link = 'fanfou.com' + m.pic_link
    m.pic = e('.photo').attr('href')
    m.pic = str(m.pic).split('@', 1)[0]
    # log('piclink', m.pic_link)
    return m
Example #23
def url2wordcloud(url,
                  requests_kwargs={},
                  exclude_punct=True,
                  normalized=True,
                  limit=None,
                  size=1,
                  min_len=None):
    """Convert the text content of a urls' html to a wordcloud config.

    Args:
        url (str): The url to load.
        requests_kwargs (dict, optional): The kwargs to pass to the
            requests library. (e.g. auth, headers, mimetypes)
        exclude_punc (bool, optional): exclude punctuation
        min_length (int, optional): the minimum required length, if any
        limit (int, optional): the number of items to limit
            (by most common), if any
        normalized (bool, optional): normalize data by
            lowercasing and strippping whitespace

    Returns:
        same value as :func:`~format_4_wordcloud`
    """
    resp = requests.get(url, **requests_kwargs)
    if not resp.status_code == 200:
        return []
    resp = Pq(resp.content).find('body').text().split(' ')
    if exclude_punct:
        resp = [
            re.sub(r'[^a-zA-Z0-9]+', '', w) for w in resp
            if w not in punctuation
        ]
    if min_len is not None:
        resp = [w for w in resp if len(w) >= min_len]
    if normalized:
        resp = [w.lower() for w in resp]
    words = get_word_freq_distribution(resp)
    if limit is not None:
        words = words.most_common(limit)
    else:
        words = [(k, v) for k, v in words.items()]
    return format_4_wordcloud(words, size_multiplier=size)
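
A minimal call sketch for url2wordcloud; the URL and header values are illustrative only, and the helpers it depends on (get_word_freq_distribution, format_4_wordcloud) are assumed to be importable from the same module:

words = url2wordcloud(
    'https://example.com/article',
    requests_kwargs={'headers': {'User-Agent': 'wordcloud-bot/0.1'}},
    exclude_punct=True,
    limit=50,     # keep only the 50 most common words
    min_len=3,    # drop very short tokens
)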
Example #24
def IMWarring(imwarringurl):
    get_ck = Login.redlogin("http://devops.lab.everhomes.com/login")
    idlist = [184]
    for id in idlist:
        warurl = imwarringurl.format(id)
        get_page = Login.get_req(warurl)
        doc = Pq(get_page)
        tr_list = doc("#content > form:nth-child(4) > div > table > tbody>tr")
        for tr in tr_list:

            topic = "#" + doc(tr)(" td.subject > a").text() + "#"

            if topic == "##":
                continue
            else:

                tourl = "http://devops.lab.everhomes.com/" + doc(tr)(
                    " td.subject > a").attr("href")
                name = doc(tr)(" td.assigned_to > a").text()
                if name == "":
                    continue
                date = doc(tr)(" td.due_date").text()
                maintopic = "【" + name + "】" + "业务例会重点任务预警" + "—" + topic
                if date == "":
                    content = "warning!!!\n" + '<html><body><p>' + name + ',您主题为 <a style="font-family:verdana;color:3366CC  ;font-size:18px;"  href=' + tourl + '><u>' + topic + '</u></a>的任务目前处于预警状态且未填预期截止时间请及时处理</p></body></html>'
                else:
                    date = datetime.datetime.strptime(date, "%Y-%m-%d")
                    delay = (datetime.datetime.now() - date).days
                    if delay > 0:
                        content = "warning!!!\n" + '<html><body><p>' + name + ',您主题为 <a style="font-family:verdana;color:3366CC  ;font-size:18px;"  href=' + tourl + '><u>' + topic + '</u></a>的任务已延期' + str(
                            delay) + '天请及时处理<p></body></html>'
                    else:
                        content = "warning!!!\n" + '<html><body><p>' + name + ',您主题为 <a style="font-family:verdana;color:3366CC  ;font-size:18px;"  href=' + tourl + '><u>' + topic + '</u></a>的任务距离截止日期还剩' + str(
                            abs(delay)) + '天请及时处理</p></body></html>'
                addr = [mailto_dict[name], mailto_dict["st"]]

                if send_mail(addr, maintopic, content):  # mail subject and mail content
                    print(addr)
                    print("done!")
                    print(content)
                else:
                    print("failed!")
Example #25
def fanfou_from_url(url):
    """
    Download and parse messages from the url.
    """
    page = cached_page(url)
    e = Pq(page)
    items = e('.message li')
    log('消息来自', items)
    # items[0]('title')
    # Call fanfou_from_div on each item
    # __iter__ iterator
    f = [fanfou_from_div(i) for i in items]
    for i in f:
        record = i.__dict__
        save.SQLsave(record)
        if record['pic'] != 'None':
            pic = record['pic']
            log('保存照片', pic)
            save_pic(pic)

    return Fanfou
Example #26
def get_products():
    try:
        wait_.until(
            ec.presence_of_element_located(
                (By.CSS_SELECTOR, '#mainsrp-itemlist .items .item')))
        html = browser.page_source
        doc = Pq(html)
        items = doc('#mainsrp-itemlist .items .item').items()
        for item in items:
            product = {
                'title': item.find('.title').text(),
                'price': item.find('.price').text(),
                'pay': item.find('.deal-cnt').text(),
                'shop': item.find('.shop').text(),
                'location': item.find('.location').text(),
                'image': item.find('.pic .img').attr('src'),
            }
            save_to_mongodb(product)
            print(product)
    except exceptions.TimeoutException:
        return get_products()
Example #27
def search_youtube_video(title, pages):
    print("Entramos en la busqueda")
    cont = 0
    lista_url = []
    lista_views = []
    for page in range(pages):
        params = urllib.parse.urlencode({
            'search_query':
            'intitle:"%s", video' % title,
            'page':
            page
        })
        jq = Pq(
            url="http://www.youtube.com/results?%s" % params,
            headers={
                "user-agent":
                "Mozilla/5.0 (Windows NT 6.1; rv:24.0) Gecko/20140129 Firefox/24.0"
            })
        jq.make_links_absolute("http://www.youtube.com")
        for video in jq("ol.item-section").children().items():
            url = video.find("a.yt-uix-tile-link").attr("href")
            lista_url.append(url)
            views = video.find("ul.yt-lockup-meta-info li").eq(1).html()
            if views is not None:
                res = int(
                    views.split('visualizaciones')[0].strip().replace('.', ''))
            else:
                res = 0
            lista_views.append(res)

            cont = cont + 1
            if cont == 8:
                indice = lista_views.index(max(lista_views))
                print("views: {} ".format(max(lista_views)))
                print("indice: {}".format(indice))
                print("url: " + lista_url[indice])
                return lista_url[indice]

    indice = lista_views.index(max(lista_views))
    return lista_url[indice]
Example #28
    def redlogin(loginurl):

        cj = http.cookiejar.LWPCookieJar()
        cookie_support = urllib.request.HTTPCookieProcessor(cj)
        opener = urllib.request.build_opener(cookie_support,
                                             urllib.request.HTTPHandler)
        urllib.request.install_opener(opener)
        h = urllib.request.urlopen(loginurl)
        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0'
        }
        # Spoof a browser user agent
        req = urllib.request.Request(url=loginurl, headers=headers)
        # Build the request
        urllib.request.install_opener(opener)
        m_fp = urllib.request.urlopen(req, timeout=500)
        # Visit the site to get its source
        html_str = m_fp.read().decode('utf-8')
        # Read the source; the site uses utf-8 encoding
        doc = Pq(html_str)
        authenticity_token = doc("head > meta:nth-child(8)").attr("content")
        print("authenticity_token=: " + authenticity_token)

        headers = {
            'User-Agent':
            'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:23.0) Gecko/20100101 Firefox/23.0',
            'Referer': "http://devops.lab.everhomes.com/login",
        }
        values = {
            "authenticity_token": authenticity_token,
            "username": "******",
            "password": "******"
        }
        data = urllib.parse.urlencode(values).encode('utf-8')

        req = urllib.request.Request(url=loginurl, headers=headers, data=data)
        urllib.request.install_opener(opener)
        m_fp = urllib.request.urlopen(req)
Example #29
    def _extract_data2(self, doc_str):
        doc = Pq(doc_str)
        li_list = doc(".mainListing.clearfix>.pL>.list>li")
        for li in li_list:
            self._community_detail["url"] = doc(li).find(
                ".details>div>a").attr("href")
            self._community_detail["name"] = doc(li).find(
                ".details>div>a").text()
            p = doc(li).find(".details>p")
            self._community_detail["location"] = doc(p[0]).text()
            self._community_detail["area_name"] = self._community_detail[
                "location"][self._community_detail["location"].index("[") +
                            1:self._community_detail["location"].index("]")]
            self._community_detail["location"] = self._community_detail[
                "location"][self._community_detail["location"].index("]") + 1:]

            url = doc(li).find(".details>.p_links>a").attr("href")
            self._community_detail['latitude'] = url[url.index("l1=") +
                                                     3:url.index("&l2")]
            self._community_detail['longitude'] = url[url.index("l2=") +
                                                      3:url.index("&l3")]
            self._save_community()
Example #30
File: csdn.py  Project: Auraxc/csdn-spider
def cached_page(url):
    """
    Save the cached page.
    """
    page_dic = {}
    filename = '{}.html'.format(url.split('/')[-1])
    page = get_page(url, filename)
    e = Pq(page)
    # tmp = e('.article-list .article-item-box.csdn-tracking-statistics .h4')
    # print(tmp)
    items = e('.article-list .article-item-box.csdn-tracking-statistics').items()
    for i in items:
        k = i.attr("data-articleid")
        v = str(i("h4")("a").text())
        page_dic[k] = v
    for p in page_dic.items():
        url = "https://blog.csdn.net/{}/article/details/{}".format(config.author, p[0])
        print(p[0], p[1])
        page = get_page(url, "{}.html".format(p[0]))
        save_md(page, p[0])
    # items = items.children(".article-item-box.csdn-tracking-statistics")
    # print(items)
    return page