async def test_stop(test_client):
    agent = Agent("jid@server", "password")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade/stop")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("div.alert-warning > span::text").get().strip() == "Agent is stopping now."

    with LogCapture() as log:
        try:
            await client.get("/spade/stop/now/", timeout=0.0005)
        except requests.exceptions.ReadTimeout:
            pass
        log.check_present(('spade.Web', 'WARNING', "Stopping agent from web interface."))

    counter = 5
    while agent.is_alive() and counter > 0:
        counter -= 0.5
        time.sleep(0.5)
    assert not agent.is_alive()
def test_has_class_tab(self):
    body = u"""
    <p CLASS="foo\tbar">First</p>
    """
    sel = Selector(text=body)
    self.assertEqual(
        [x.extract() for x in sel.xpath(u'//p[has-class("foo")]/text()')],
        [u'First'])
def get_classes(html):
    doc = Selector(text=html)
    classes = set(doc.xpath('//*[@class]/@class').extract())
    result = set()
    for cls in classes:
        for _cls in cls.split():
            result.add(_cls)
    return result
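# --- Hedged usage sketch (not part of the original snippet) ---
# get_classes() above collects every individual class token found in the
# document; the HTML below is a made-up example.
sample_html = '<div class="card featured"><p class="card">hi</p></div>'
assert get_classes(sample_html) == {'card', 'featured'}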
def getimgsrc(pin_id):
    url = 'http://huaban.com/pins/%s/' % pin_id
    z = requests.get(url, headers={'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/56.0.2924.87 Safari/537.36'})
    sel = Selector(text=z.text)
    jscode = sel.xpath("//script[contains(., 'app.page = app.page')]/text()").extract_first()
    parsed_js = js2xml.parse(jscode)
    for i in parsed_js.xpath('//property[@name="pins"]//property[@name="key"]/string/text()'):
        print('http://img.hb.aicdn.com/' + i)
def get_categories(self, response=None):
    if not response:
        response = requests.get(self.url)
    sel = Selector(text=response.text)
    categories = sel.xpath("//select[@id='search_category']"
                           "/option/text()").extract()
    categories = [c.split(' by ')[0].replace(' & ', '-') for c in categories]
    return categories
def getheaders():
    'Parse the X-UDID and Xsrftoken out of the page source'
    z1 = s.get('https://www.zhihu.com/')
    sel = Selector(z1.text)
    jsdata = sel.css('div#data::attr(data-state)').extract_first()
    xudid = json.loads(jsdata)['token']['xUDID']
    xsrf = json.loads(jsdata)['token']['xsrf']
    headers = headers_raw_to_dict(post_headers_raw)
    headers['X-UDID'] = xudid
    headers['X-Xsrftoken'] = xsrf
    return headers
def load_chapters(url):
    """
    Loads all chapters from a manga comic and returns a list of dictionaries
    with related data.
    :return: chapter list in asc order
    """
    text = requests.get(url).text
    sel = Selector(text)
    hel_gen = sel.css(".chlist h3, .chlist h4")
    chapter_gen = map(hel_to_chapter, hel_gen)
    available_chapter_gen = filter(lambda v: v['title'], chapter_gen)
    return reversed(list(available_chapter_gen))
def update_match_streams(self, matches: List[Match]) -> List[Match]:
    """Populate Match objects with stream urls"""
    updated = []
    for item in matches:
        # Populate stream data if match is live
        if not item['time_secs']:
            resp = self.session.get(item['url'])
            sel_detailed = Selector(text=resp.text)
            item['stream'] = sel_detailed.xpath(
                "//div[@class='matches-streams']"
                "/span[.//a[re:test(text(),'english', 'i')]]"
                "//iframe/@src").extract_first()
            item['stream'] = clean_stream_url(item['stream'])
        updated.append(item)
    return updated
async def test_add_get(test_client):
    agent = Agent("jid@server", "password")
    agent.web.add_get("/test", lambda request: {"number": 42}, "examples/hello.html")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/test")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("h1::text").get().strip() == "42"

    agent.stop()
def main(argv=None, progname=None):
    parser = argparse.ArgumentParser(prog=progname, description=__doc__)
    parser.add_argument('expr', metavar='EXPRESSION',
                        help="A CSS expression, or an XPath expression if --xpath is given.")
    parser.add_argument('file', metavar='FILE', nargs='?',
                        help="If missing, it reads the HTML content from the standard input.")
    parser.add_argument('--xpath', action='store_true',
                        help="Given expression is an XPath expression.")
    parser.add_argument('--re', metavar='PATTERN',
                        help="Apply given regular expression.")
    parser.add_argument('--encoding', metavar='ENCODING', default='utf-8',
                        help="Input encoding. Default: utf-8.")
    parser.add_argument('--repr', action='store_true',
                        help="Output result object representation instead of as text.")
    # TODO: Output this and parsel version.
    args = parser.parse_args(argv)

    if args.file:
        text = open(args.file).read()
    else:
        text = sys.stdin.read()
    if isinstance(text, six.binary_type):
        try:
            text = text.decode(args.encoding)
        except UnicodeDecodeError:
            parser.error("Failed to decode input using encoding: %s" % args.encoding)

    sel = Selector(text=text)

    if args.xpath:
        result = sel.xpath(args.expr)
    else:
        result = sel.css(args.expr)

    if args.re:
        regex = args.re.encode(args.encoding)
        regex = regex.decode('string_escape' if six.PY2 else 'unicode_escape')
        out = result.re(re.compile(regex, re.IGNORECASE | re.UNICODE))
    else:
        out = result.extract()

    if args.repr:
        pprint.pprint(out)
    else:
        print("\n".join(out))

    return 0
def get_alexa_demographics(url, db_session=False):
    if db_session is not False:
        result = list(db_session.query(WebsitesCache).filter_by(link=url))
        if len(result) > 0 and result[0].male_ratio_alexa >= 0:
            return float(result[0].male_ratio_alexa), float(result[0].female_ratio_alexa)
        else:
            return 0.0, 0.0

    orig_url = url
    url = "http://www.alexa.com/siteinfo/" + url
    response = requests.get(url)
    # We need the decode part because Selector expects unicode.
    selector = Selector(response.content.decode('utf-8'))
    bars = selector.css("#demographics-content .demo-col1 .pybar-bg")

    values = []
    for bar in bars:
        value = bar.css("span::attr(style)").extract()[0]
        value = int(re.search(r'\d+', value).group())
        values.append(value)

    male_ratio = 0.0
    female_ratio = 0.0
    if sum(values) == 0:
        print("No alexa rating for " + url)
    else:
        male_ratio = float(values[0] + values[1]) / sum(values)
        female_ratio = float(values[2] + values[3]) / sum(values)

    print(url)
    print(values)
    print(male_ratio, female_ratio)

    # Do we want to cache the result?
    if db_session is not False:
        try:
            db_session.query(WebsitesCache).filter(WebsitesCache.link == orig_url) \
                .update({
                    'male_ratio_alexa': male_ratio,
                    'female_ratio_alexa': female_ratio
                })
            db_session.commit()
        except:
            print("Could not update " + url)

    return male_ratio, female_ratio
async def test_request_home(test_client):
    agent = make_connected_agent("jid@server", "password")
    future = agent.start(auto_register=False)
    future.result()
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("title::text").get() == "jid agent"
    assert sel.css("img::attr(src)").get() == agent.avatar
    assert sel.css("ul.products-list > li").getall() == []

    agent.stop()
async def test_get_messages(test_client):
    agent = Agent("jid@server", "password")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    # add messages to trace
    for i in range(5):
        msg = Message(body=str(i), sender="{}@server".format(i), to="receiver@server")
        agent.traces.append(msg)

    response = await client.get("/spade/messages/")
    response = await response.text()

    sel = Selector(text=response)
    assert len(sel.css("ul.timeline > li").getall()) == 6  # num messages + end clock

    agent.stop()
def mon(inputs):
    week = []
    errored_out = []
    for month in inputs:
        try:
            data = urllib.request.urlopen(month).read()
        except urllib.error.URLError as e:
            print(month)
            errored_out.append(month)
            print(e.reason)
        if type(data) is bytes:
            data = data.decode("utf-8")
            hxs = Selector(text=data)
            weeks = hxs.xpath('//ul[@class="weeks"]/li/a').re('http://www.businessweek.com/archive/\\d+-\\d+/news/day\\d+\.html')
            week.append(weeks)
        else:
            hxs = Selector(text=data)
            weeks = hxs.xpath('//ul[@class="weeks"]/li/a').re('http://www.businessweek.com/archive/\\d+-\\d+/news/day\\d+\.html')
            week.append(weeks)
    return week
def post(inputs):
    posted = []
    failed = []
    for week in inputs:
        try:
            data = urllib.request.urlopen(week).read()
        except urllib.error.URLError as e:
            failed.append(week)
            print(week)
            print(e.reason)
        if type(data) is bytes:
            data = data.decode("utf-8")
            hxs = Selector(text=data)
            posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract()
            posted.append(posts)
        else:
            hxs = Selector(text=data)
            posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract()
            posted.append(posts)
    return posted
def download_chapter(chapter, folder_name):
    """
    Grabs all images from a chapter and writes them down to filesystem.
    """
    folder_name = werkzeug.utils.secure_filename(folder_name)
    # if the folder does not exist ...
    if not os.path.exists(folder_name):
        os.mkdir(folder_name)
    text = requests.get(chapter['href']).text
    sel = Selector(text)
    for value in sel.css("select[class='m'] > option::attr(value)").extract():
        value = int(value)
        url = re.sub(r'\d+\.html', '%d.html' % value, chapter['href'])
        download_page(url, folder_name)
async def test_get_behaviour(test_client):
    class EmptyOneShotBehaviour(OneShotBehaviour):
        async def run(self):
            self.kill()

    agent = Agent("jid@server", "password")
    behaviour = EmptyOneShotBehaviour()
    agent.add_behaviour(behaviour)
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.get("/spade/behaviour/OneShotBehaviour/EmptyOneShotBehaviour/")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("section.content-header > h1::text").get().strip() == "OneShotBehaviour/EmptyOneShotBehaviour"

    agent.stop()
async def test_add_post(test_client):
    agent = Agent("jid@server", "password")

    async def handle_post(request):
        form = await request.post()
        number = form["number"]
        return {"number": number}

    agent.web.add_post("/test", handle_post, "examples/hello.html")
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    response = await client.post("/test", data={"number": 1024})
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("h1::text").get() == "1024"

    agent.stop()
def download_page(url, folder_name):
    text = requests.get(url).text
    sel = Selector(text)
    for src in sel.css("img[id='image']::attr(src)").extract():
        basename = os.path.basename(src)
        safe_basename = werkzeug.utils.secure_filename(basename)
        filename = os.path.join(folder_name, safe_basename)
        filename = os.path.abspath(filename)
        # file is not there or has an invalid size ...
        if not os.path.exists(filename) or os.path.getsize(filename) == 0:
            data = requests.get(src).content
            with open(filename, 'wb') as file:
                file.write(data)
            print('{0} written.'.format(filename))
        else:
            print('{0} exists. Skipping.'.format(filename))
def test_has_class_simple(self):
    body = u"""
    <p class="foo bar-baz">First</p>
    <p class="foo">Second</p>
    <p class="bar">Third</p>
    <p>Fourth</p>
    """
    sel = Selector(text=body)
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo")]/text()')],
        [u'First', u'Second'])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("bar")]/text()')],
        [u'Third'])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo","bar")]/text()')],
        [])
    self.assertEqual(
        [x.extract() for x in sel.xpath('//p[has-class("foo","bar-baz")]/text()')],
        [u'First'])
def download(self, **kwargs): """ Download and set image from wallhaven.cc :param position - position of image to choose from listed from 1 to 24, default is 0 = random. :param categories - categories to download from in 000 format, where every number represents binary for [general, anime, people] list. :param purity - purity of content in 000 format, where every number represents binary for [sfw, sketchy, _]. :param sorting - sorting type from available see WallhavenDownloader.sorting_types . """ # Make url from arguments order = 'desc' categories = kwargs.get('categories', '') purity = kwargs.get('purity', '') sorting = kwargs.get('sorting', '') page, position, rand = self._make_position(kwargs.get('position', 0)) url = self.base_url for arg in ['categories', 'purity', 'sorting', 'order', 'page']: value = locals()[arg] if value: url = add_or_replace_parameter(url, arg, locals()[arg]) # Download and parse items resp = requests.get(url) if resp.status_code != 200: self.logger.error('Failed to download image list {}'.format(resp.url)) return list_sel = Selector(text=resp.text) items = list_sel.xpath("//section[@class='thumb-listing-page']//figure/a/@href").extract() item = random.choice(items) if rand else items[position - 1] resp = requests.get(item) if resp.status_code != 200: self.logger.error('Failed to download image page {}'.format(resp.url)) return sel = Selector(text=resp.text) image_url = sel.xpath("//img[@id='wallpaper']/@src").extract_first() meta = { 'id': sel.xpath("//img[@id='wallpaper']/@data-wallpaper-id").extract_first(), 'tags': sel.xpath("//ul[@id='tags']//li/a/text()").extract(), 'views': sel.xpath("//dt[contains(text(),'Views')]/following-sibling::dd[1]/text()").extract_first(), 'favorites': sel.xpath("//dt[contains(text(),'Favorites')]" "/following-sibling::dd[1]//text()").extract_first(), 'res': sel.xpath("//h3/text()").extract_first(), } image = Image(image_url, meta) return self.process_url(image, kwargs)
def _find_match(self, sel: Selector) -> Match:
    xpath = lambda x: sel.xpath(x).extract_first(default='').strip()
    item = Match()
    item['url'] = urljoin(self.url_base, xpath(".//a/@href"))
    item['id'] = (re.findall('matches/(\d+)', item['url']) or [None])[0]
    item['game'] = next((g for g in self.games if g in item['url'].lower()))
    item['time'] = xpath("td[@class='status']/span/text()")
    item['time_secs'] = time_to_seconds(item['time'])
    item['timestamp'] = int((datetime.now() + timedelta(item['time_secs'])).timestamp())
    item['t1'] = xpath(".//span[contains(@class,'opp1')]/span/text()")
    item['t1_country'] = xpath(".//span[contains(@class,'opp1')]/span[contains(@class,'flag')]/@title")
    item['t1_country_short'] = xpath(".//span[contains(@class,'opp1')]"
                                     "/span[contains(@class,'flag')]/@class").split()[-1]
    item['t2'] = xpath(".//span[contains(@class,'opp2')]/span/text()")
    item['t2_country'] = xpath(".//span[contains(@class,'opp2')]/span[contains(@class,'flag')]/@title")
    item['t2_country_short'] = xpath(".//span[contains(@class,'opp2')]"
                                     "/span[contains(@class,'flag')]/@class").split()[-1]
    scores = sel.css('.score::text').extract()
    item['t1_score'] = scores[0] if scores else None
    item['t2_score'] = scores[1] if len(scores) > 1 else None
    return item
async def test_get_agent(test_client):
    agent = make_presence_connected_agent("jid@server", "password")
    future = agent.start(auto_register=False)
    future.result()
    agent.web.setup_routes()
    client = await test_client(agent.web.app)

    jid = "friend@server"
    item = Item(jid=JID.fromstr(jid))
    agent.presence.roster._update_entry(item)

    response = await client.get(f"/spade/agent/{jid}/")
    response = await response.text()

    sel = Selector(text=response)
    assert sel.css("section.content-header > h1::text").get().strip() == jid

    agent.stop()
def test_set_xpathfunc(self):
    def myfunc(ctx):
        myfunc.call_count += 1

    myfunc.call_count = 0

    body = u"""
    <p CLASS="foo">First</p>
    """
    sel = Selector(text=body)
    self.assertRaisesRegexp(
        ValueError, 'Unregistered function in myfunc',
        sel.xpath, 'myfunc()')

    set_xpathfunc('myfunc', myfunc)
    sel.xpath('myfunc()')
    self.assertEqual(myfunc.call_count, 1)

    set_xpathfunc('myfunc', None)
    self.assertRaisesRegexp(
        ValueError, 'Unregistered function in myfunc',
        sel.xpath, 'myfunc()')
def ruleDetailEnShiShi(jsonData, detailUrl):
    for dat in jsonData.get('list'):
        # publication time
        bulletinIssueTime = dat.get('bulletinIssueTime')
        bulletinName = dat.get('bulletinName')
        # get all of the table data, which comes back as HTML
        bulletinContent = dat.get('bulletincontent')
        selectors = Selector(text=bulletinContent)
        # the standard format looks like this: https://www.hbggzyfwpt.cn/jyxx/jsgcZbjggsDetail?guid=804c6b5b-e69f-4118-8d38-9c9d9413eb65&isOther=false
        # tenderer (party issuing the tender)
        zhaoBiaoRen = selectors.xpath(
            "//div[text()='招标人或招标代理机构:']/../following-sibling::td[1]/div/text()"
        ).extract_first()
        # address, located relative to the tenderer cell; it can also be located by the address label itself
        diZHi = selectors.xpath(
            "//div[text()='地址:']/../following-sibling::td[1]/div/text()").get(
                default='')
        # diZHi = selectors.xpath("//td[contains(text(),'地址:')]/text()").getall()[0]
        # winning bidder
        zhongBiaoRen = selectors.xpath(
            "//div[text()='中标人']/../../following-sibling::tr[1]/td[2]//text()"
        ).get(default='')
        # winning bid price
        zhongBiaoJia = selectors.xpath(
            "//div[text()='中标人']/../../following-sibling::tr[1]/td[3]//text()"
        ).get(default='')
        items = {
            'bulletinIssueTime': bulletinIssueTime,
            'zhaoBiaoRen': zhaoBiaoRen,
            'bulletinName': bulletinName,
            'diZHi': diZHi,
            'zhongBiaoRen': zhongBiaoRen,
            'zhongBiaoJia': zhongBiaoJia,
            'detailUrl': detailUrl,
            'city': '恩施市'
        }
        return items
def carregarG1():
    noticias = []
    try:
        html = urlopen("http://g1.globo.com/economia/ultimas-noticias.html", timeout=600000).read()
    except URLError as e:
        print(e)
        print("Connection to G1 failed")
        return []

    retorno = str(html.decode("utf-8"))

    # Extract the information from the page HTML
    sel = Selector(text=retorno)
    titulos = sel.css('.feed-post-body').css('.feed-post-body-title').css(
        '.feed-post-link').xpath('.//text()').getall()
    links = sel.css('.feed-post-body').css('.feed-post-body-title').css(
        '.feed-post-link').xpath('.//@href').getall()
    resumos = sel.css('.feed-post-body').css('.feed-post-body-resumo').xpath(
        './/text()').getall()
    categorias = sel.css('.feed-post-body').css('.feed-post-metadata').css(
        '.feed-post-metadata-section').xpath('.//text()').getall()
    datas = sel.css('.feed-post-body').css('.feed-post-metadata').css(
        '.feed-post-datetime').xpath('.//text()').getall()

    # Build the list of news items
    conteudo = []
    for indice in range(len(titulos)):
        conteudo.append({
            "fonte": "G1",
            "titulo": "" + titulos[indice] + "",
            "link": "" + links[indice] + "",
            "resumo": "" + resumos[indice] + "",
            "categoria": "" + categorias[indice] + "",
            "data": "" + datas[indice] + ""
        })
    return conteudo
def extract_job(driver, url) -> Job:
    driver.get(url)
    sel = Selector(text=driver.page_source)

    job_id = int(url.split('/')[5])
    job_url = url
    company_name = str(
        sel.xpath('//a[@data-control-name="company_link"]/text()')[2].get()).strip()
    company_url = str(
        sel.xpath('//a[@data-control-name="company_link"]/@href')[0].get())

    element = sel.xpath('//span[@class="jobs-top-card__bullet"]/text()')
    if not element:
        element = sel.xpath(
            '//a[@data-control-name="commute_module_anchor"]/text()')
    address = str(element[0].get()).strip()

    description = ",".join(
        sel.xpath('//div[@id="job-details"]//descendant::*/text()').extract()
    ).strip()

    return Job(job_id, job_url, company_name, company_url, address, description)
def parse_detail(url): headers = { 'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36' } response = requests.get(url, headers=headers).text sel = Selector(text=response) title = sel.xpath( "normalize-space(//div[@class='title-box']/h1[@class='title J_title']/text())" ).extract()[0] content_a = sel.xpath( "//article[@class='wrap-main']/div[@id='feed-main']/div[@class='item-name']/article[@class='txt-detail']/*[name(.)!='span']" ) dd = '' for content in content_a: content_temp = content.xpath( 'normalize-space(.)').extract_first().replace("'", '') if "极速发" in content_temp or "查看点评" in content_temp or "中立的消费门户" in content_temp or "小值机器人" in content_temp: continue dd += content_temp + '\n' links = sel.xpath( "//article[@class='txt-detail']/p/a[@itemprop='description']/@href" ).getall() text_links = sel.xpath( "//article[@class='txt-detail']/p/a[@itemprop='description']/text()" ).getall() temp_links = [] temp_text_links = [] shop_num = re.search(r'-?\d+', str(url), re.M | re.I).group() print(shop_num) if links and text_links: for link in links: if len(text_links) > 0: text_link = text_links.pop() if text_link in dd: if 'www.smzdm.com/p' in link and link: temp_links.append(parse_link(link)) if 'go.smzdm.com' in link: temp_links.append(link) temp_text_links.append(text_link) return (title, dd, temp_links, temp_text_links, shop_num)
def get_page_data(url):
    data = get_data(url)  # fetch the page
    se = Selector(text=data)
    poster = se.css(".grid_view .pic img").xpath("./@src").extract()  # poster image URLs
    movie_name = se.css(".grid_view .pic img").xpath("./@alt").re("\w+")  # movie names
    tmp_actor_o_info = se.css(".grid_view .info .bd").xpath("./p/text()")  # release info and genre text
    actor, other_info = tmp_actor_o_info[::4], tmp_actor_o_info[1::4]
    actor, other_info = [i.get() for i in actor], [i.get() for i in other_info]
    score = se.css(".star>.rating_num::text").extract()  # ratings

    movie_data = {}
    for p, m, a, o, s in zip(poster, movie_name, actor, other_info, score):
        a = a.strip()
        try:
            other_info = o.strip().replace("\xa0", " ")
            if "导演:" in a:
                director, a = a[4:].split(" ")
            else:
                director, a = a, a
            director = director.strip().split("/")  # list
            year, country, movie_type = other_info.strip().split("/")
            movie_type = movie_type.strip().split(" ")  # list
            movie_data.update({
                m: {
                    "poster": p,
                    "movie_name": m,
                    "director": "/".join(director),
                    "movie_type": "/".join(movie_type),
                    "score": s,
                    "country": country.strip(),
                    "year": int(year)
                }
            })
        except ValueError:
            continue
    return movie_data
def get_url(self, list_fname):
    r"""
    Parse the list page and store the extracted data into MySQL.
    """
    with open(list_fname, encoding='utf8') as f:
        text = f.read()
    html = Selector(text, type='html')
    result = html.xpath("//h5[@class='issue-item__title']/a/@href").getall()
    # time.sleep(120)
    titles = html.xpath("//h5[@class='issue-item__title']/a/text()").getall()
    eISBN = html.xpath(
        "//div[@class='teaser__row'][2]/div[@class='teaser__item']/text()"
    ).getall()
    if eISBN != []:
        if "eISBN" in eISBN[0]:
            eisbn = eISBN[0].replace("eISBN:", "").replace("-", "")
        else:
            eisbn = ""
    if result is None:
        return
    else:
        for i, item in enumerate(result):
            lists = []
            self.i += 1
            url = 'https://arc.aiaa.org' + item
            name = re.findall('10.2514/(.*)', item)[0]
            title = titles[i]
            try:
                session = html.xpath(
                    "//div[@class='issue-item__session_details']/text()"
                ).getall()[i]
            except:
                session = ""
            message = (url, name, title, session, eisbn)
            sql = "insert ignore into detail (url,doi,title,session,eisbn) values(%s,%s,%s,%s,%s)"
            cur = self.conn.cursor()
            cur.execute(sql, message)
            self.conn.commit()
def down_engin(self, message): url = 'http://engineering.org.cn/EN/2095-8099/current.shtml' feature = 'txt_biaoti' fdir = self.list_path + '/' + 'engi' if not os.path.exists(fdir): os.makedirs(fdir) fname = '%s/engi_current.html' % fdir while True: resp = self.gethtml(url, feature) if resp: break selcover = Selector(text=resp.content.decode('utf8')) tdTag = selcover.xpath('//td[@class="img_display"]')[0] engicoverurl = tdTag.xpath('./img/@src').extract_first() engicoverurl = engicoverurl.replace('../..', '') with open(fname, mode='w', encoding='utf8') as f: f.write(resp.content.decode('utf8')) url = 'http://engineering.org.cn/EN/article/showOldVolumnList.do' feature = 'Current Issue' while True: resp = self.gethtml(url, feature) if resp: break sel = Selector(text=resp.content.decode('utf8')) conn = utils.init_db('mysql', 'hepjournal', 4) cur = conn.cursor() cur.execute( "update journal set cover_url='%s' where journal_id='engi'" % engicoverurl) for aTag in sel.xpath('//a[contains(@href, "../volumn/volumn")]'): url = 'http://engineering.org.cn/EN' + aTag.xpath( './@href').extract_first().replace('..', '') sql = "insert ignore into issue(url,journal_id) Values('%s','%s')" % ( url, 'engi') cur.execute(sql) conn.commit() cur.close() conn.close() self.senddistributefinish('startdown_list')
def get_hous_detail(start_url_content): try: price = re.findall('"comm_midprice":"(.*?)","area_midprice"',start_url_content,re.S)[0] except: price = re.findall('"comm_midprice":(.*?),"area_midprice"',start_url_content,re.S)[0] print(price) l2 = re.findall('lat : "(.*?)",.*?lng : "(.*?)"',start_url_content,re.S) lat_lng= [float(l2[0][0]), float(l2[0][1])] print(lat_lng) html = Selector(text=start_url_content) detali_dt = html.xpath('//*[@id="basic-infos-box"]/dl/dt') address = html.xpath('//span[@class="sub-hd"]/text()').extract_first() all_add = html.xpath('//div[@class="p_1180 p_crumbs"]/a/text()').extract() city = all_add[1].replace('小区','') county = all_add[2] community = all_add[3] community_name = all_add[4] pin = Pinyin() province = gen_address(city) sheet_name = pin.get_pinyin(province, "").replace('sheng', '').replace('shi', '') print(province,city,county,community,community_name) print(address) dt = [] for i in detali_dt: key1 = i.xpath('./text()').extract_first().replace('\xa0','').replace(':','') key = ho.get(key1) dt.append(key) # print('{}{}'.format(i.xpath('./dt/text()').extract_first(),i.xpath('./dd/text()').extract_first())) # print('{}'.format(i.xpath('./text()').extract_first())) detali_dd = html.xpath('//*[@id="basic-infos-box"]/dl/dd') dd = [] for i in detali_dd: dd.append(i.xpath('./text()').extract_first()) # print('{}{}'.format(i.xpath('./dt/text()').extract_first(),i.xpath('./dd/text()').extract_first())) # print('{}'.format(i.xpath('./text()').extract_first())) a = dict(zip(dt,dd)) print(a)
def parse_book(self, response):
    link = response._get_url()
    r = requests.get(link)
    sel = Selector(r.text)
    book_data = {}
    title = sel.css('div.col-sm-6.product_main>h1')[0].extract().replace(
        '<h1>', '').replace('</h1>', '')
    df = pd.read_html(link)[0]
    book_data['category'] = sel.css('ul li')[2].extract().split(
        '</a>')[0].split('>')[2]
    book_data['title'] = title
    book_data['price'] = df[1][3]
    book_data['units left'] = int(df[1][5].replace('In stock (', '').replace(
        ' available)', ''))
    book_data['UPC'] = df[1][0]
    book_data['url'] = link
    book_data['description'] = sel.css('p')[3].extract().replace(
        '<p>', '').replace('</p>', '')
    book_data['rating'] = sel.css('p')[2].extract().split('\n')[0].replace(
        '<p class="star-rating ', '').replace('">', '')
    yield book_data
def ruleDetailWuHanShi(jsonData, detailUrl):
    for dat in jsonData.get('list'):
        # publication time
        bulletinIssueTime = dat.get('bulletinIssueTime')
        bulletinName = dat.get('bulletinName')
        # get all of the table data
        bulletinContent = dat.get('bulletincontent')
        selectors = Selector(text=bulletinContent)
        # the standard format looks like this: https://www.hbggzyfwpt.cn/jyxx/jsgcZbjggsDetail?guid=804c6b5b-e69f-4118-8d38-9c9d9413eb65&isOther=false
        zhaoBiaoRen = selectors.xpath(
            "//td[text()='建设单位(招标人)']/following-sibling::td[1]/text()").get(
                default='')
        # xiangMuMingCheng = selectors.xpath("//td[text()='报建项目名称']/following-sibling::td[1]/text()").get(default='')
        diZHi = selectors.xpath(
            "//td[text()='建设地址']/following-sibling::td[1]/text()").get(
                default='')
        zhongBiaoRen = selectors.xpath(
            "//td[text()='中标人']/following-sibling::td[1]/text()").get(
                default='')
        zhongBiaoJia = selectors.xpath(
            "//td[text()='中标价(万元)']/following-sibling::td[1]/text()").get(
                default='')
        items = {
            'bulletinIssueTime': bulletinIssueTime,
            'zhaoBiaoRen': zhaoBiaoRen,
            'bulletinName': bulletinName,
            'diZHi': diZHi,
            'zhongBiaoRen': zhongBiaoRen,
            'zhongBiaoJia': zhongBiaoJia,
            'detailUrl': detailUrl,
            'city': '武汉市'
        }
        yield items
def parsel_for_parse_page(response):
    """
    Parse the page data with the parsel module.
    :param response: response data
    :return:
    """
    selector = Selector(response.text)
    # mapping from obfuscated class names to digits
    mappings = {
        'vhk08k': 0, 'vhk6zl': 1, 'vhk9or': 2, 'vhkfln': 3, 'vhkbvu': 4,
        'vhk84t': 5, 'vhkvxd': 6, 'vhkqsc': 7, 'vhkjj4': 8, 'vhk0f1': 9
    }
    phone_element = selector.css('div.col.more > d').getall()
    tel_vhk_list = []
    for d_em in phone_element:
        d = Selector(d_em)
        x = d.css('d::attr("class")').get()
        tel_vhk_list.append(x)
    # print(tel_vhk_list)
    # look up the digit mapped to each class name
    tel_num_list = [mappings.get(i) for i in tel_vhk_list]
    tel_num_list = list(
        map(lambda x_: (str(x_) if x_ is not None else "-"), tel_num_list))
    phone = "".join(tel_num_list)
    print(phone)
def xpath_local_content(text, local_point="电话", contrast_point="地址") -> list: content = text.replace(" ", "").replace("\u3000", "") # content = response.replace(" ", "").replace("\u3000", "") selete = Selector(content) targets = [ '//*[contains(text(),"{}")]'.format(local_point), '//*[contains(text(),"手机")]', '//*[contains(text(),"座机")]' ] second_target = '//*[contains(text(),"{}")]'.format(contrast_point) lenght = len(targets) first_local = [] index = 0 for index, first_value in enumerate(targets): first_local = selete.xpath(first_value) if first_local: break if not first_local: return [] while index + 1 <= lenght: for count in range(1, 4): find_parent = '/..' * count first_local = selete.xpath(targets[index] + find_parent) second_local = selete.xpath(second_target + find_parent).extract() for second_value in second_local: for offset, value in enumerate(first_local): result = value.extract() # print(result) if result == second_value: result = selete.xpath(targets[index] + find_parent + '/*').extract() return result index += 1 return []
def down_list(): with open('1.html', mode='r', encoding='utf8') as f: text = f.read() html = Selector(text, 'html') dl = html.xpath("//li[@id='thA']//dl") for item in dl: item: Selector dt = item.xpath("./dt/a/text()").extract_first('') dd = item.xpath("./dd/a/text()").extract() dd_url = item.xpath("./dd/a/@href").extract() for i, small in enumerate(dd): name = dt + ';' + small filename = list_path + '/' + name if not os.path.exists(filename): os.makedirs(filename) list_url = "http://www.fzwjt.com" + dd_url[i] print(list_url) feature = 'PageBar41' res = utils.get_html(list_url, proxies=proxy, timeout=50) if res.content.decode('utf8').find(feature) == -1: file = '%s/%s.html' % (filename, 1) with open(file, mode='w', encoding='utf8') as f: f.write(res.content.decode()) utils.printf("下载", name, "成功...") else: html = Selector(res.text, 'html') page_num = html.xpath( "//span[@class='PageBar41']//em[3]/text()").extract_first( '') for page in range(1, int(page_num) + 1): url = list_url + "&page={page}".format(page=page) res = utils.get_html(url, proxies=proxy, timeout=50) file = '%s/%s.html' % (filename, page) with open(file, mode='w', encoding='utf8') as f: f.write(res.content.decode()) utils.printf("下载", name, page, "成功...")
# -*- coding: utf-8 -*-
import re
import requests
from parsel import Selector

# You can use RegEx so that parsel returns a list containing only a specific part of the text.
# Example:
#   text: 08:00 ~ 12:00
#   RegEx: .* (.+) ~.*
#   returns: 08:00
response = requests.get("http://semcomp.com.br/programacao")
sel = Selector(response.text).xpath(u".//*[@title='Horário']/text()").re(".* (.+) ~.*")
print("RegEx for SemComp talk times: %s" % sel)

# RegEx to validate a username based on these rules:
# it may have 3 to 16 characters, made up of letters, digits, _ (underscore) and - (hyphen)
# Example:
#   username = "******"  # Valid
#   username = "******"  # Invalid
username = "******"
pattern = re.compile("^[a-zA-Z0-9_-]{3,16}$")
if pattern.match(username):
    print("Valid")
else:
    print("Invalid")

# RegEx to find email addresses on a website
response = requests.get(
def select_sidebar(sel: Selector) -> Selector:
    """Select the info sidebar."""
    return sel.xpath("//div[@class='js-scrollfix-bottom']")
import requests
from parsel import Selector
import pprint

url = 'http://www.porters.vip/verify/uas/index.html'
headers = {"User-Agent": "Postman"}
resp = requests.get(url, headers=headers)
print(resp.status_code)
if resp.status_code == 200:
    sel = Selector(resp.text)
    res = sel.css('.list-group-item::text').extract()
    pprint.pprint(res)
else:
    print("request failed")
# In this initial step the webdriver is opened from the directory below
# (remember that the webdriver has to be downloaded beforehand)
nome_driver = Firefox()

# Target URL
link_url = 'https://ultimateqa.com/fake-landing-page'

# In this step the page is opened via the webdriver
nome_driver.get(link_url)

# Maximum wait time
tempo_maximo_espera = 30

# Look up a tag by XPath
caminho_xpath = '//h4[@class="et_pb_module_header"]'
# exemplo_xpath = '//tag_html[@atributo="valor"]/tag/tag/tipo()'

# Wait until the HTML tag defined above has loaded, up to the maximum
# wait time: tempo_maximo_espera
WebDriverWait(nome_driver, tempo_maximo_espera).until(
    EC.presence_of_element_located((By.XPATH, caminho_xpath))
)

# Continue the automation
fonte_pagina = Selector(text=nome_driver.page_source)
nome_objeto = fonte_pagina.xpath('//h4[@class="et_pb_module_header"]/a/@href').getall()
print(nome_objeto)
def twitter (name_to_search,page_number,knownimage,verbose): placeToSearch='twitter.com' chrome_options = Options() chrome_options.add_argument("--headless") chrome_options.add_argument("--no-sandbox") chrome_options.add_argument("--disable-dev-shm-usage") chrome_path = './chromedriver' driver = webdriver.Chrome(chrome_path,chrome_options=chrome_options) people_list=[] for i in range(int(page_number)): driver.get("https://www.google.com/search?q=site:"+placeToSearch+"+AND+"+name_to_search + "&start=" + str(10 * i)) search=driver.find_elements_by_tag_name('a') time.sleep(10) for s in search: href=s.get_attribute('href') if href != None: if "https://twitter.com/" in href: if "/status/" not in href and "/media" not in href and "/hashtag/" not in href and "webcache.googleusercontent.com" not in href and "google.com" not in href: people_list.append(href) elif "/hashtag/" not in href and "webcache.googleusercontent.com" not in href and "google.com" not in href: if "/status/" in href: people_list.append(href.split("/status/")[0]) elif "/media" not in s.text: people_list.append(href.split("/media")[0]) people_list=set(people_list) now = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S") if not os.path.isdir("data/twitter"): os.mkdir("data/twitter"); path=os.path.join('data/twitter',str(now)+'_twitter_data.json') jsonData=[] userLink = set() for p in people_list: if verbose: print("*******************************************************************************************************") print(p) driver.get(p) driver.implicitly_wait(50) time.sleep(2) sel = Selector(text=driver.page_source) name = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[2]/div/div/div[1]/div/span[1]/span/text()').extract_first() link = p description = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[3]/div/div/span[1]/text()').extract_first() location = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[1]/span/span/text()').extract_first() member_since = sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[2]/svg/text()').extract_first() born=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/span[2]/svg/text()').extract_first() webpage=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[4]/div/a/text()').extract_first() image_url=sel.xpath('//*[@id="react-root"]/div/div/div[2]/main/div/div/div/div/div/div[2]/div/div/div[1]/div/div[1]/a/div[1]/div[2]/div/img/@src').extract_first() if name==None: name="" if str(link) not in userLink: userLink.add(link) nameParts = name_to_search.split(' ') isMatcher = False for n in nameParts: if SequenceMatcher(None,n, name).ratio()>0.4 or SequenceMatcher(None,n,str(link)).ratio()>0.4 or n in str(description).lower(): isMatcher=True if SequenceMatcher(None,name_to_search, name).ratio()>0.4 or SequenceMatcher(None,name_to_search,str(link)).ratio()>0.4 or name_to_search in str(description).lower(): isMatcher=True if isMatcher: userData = {} if verbose: print("Name: "+str(name)) print("Link: "+str(link)) print("Description: "+str(description)) print("Location: "+ str(location)) print("Member since: "+str(member_since)) print("Born: "+str(born)) print("Web: "+str(webpage)) print ("Profile image url: "+str(image_url)) print('\n') print('\n') if knownimage: 
if not os.path.isdir("data/twitter/"+str(now)+"_images"): os.mkdir("data/twitter/"+str(now)+"_images"); image=os.path.join("data/twitter/"+str(now)+"_images/"+str(link.split('.com/')[1])+".jpg") try: urllib.request.urlretrieve(image_url, image) userData={'storedImage':image,'name':str(name),'link':str(link),'description':str(description),'location':str(location),'member_since':str(member_since),'born':str(born),'web':str(webpage),'image':str(image_url)} jsonData.append(userData) except: pass else: userData={'name':str(name),'link':str(link),'description':str(description),'location':str(location),'member_since':str(member_since),'born':str(born),'web':str(webpage),'image':str(image_url)} jsonData.append(userData) with open(path, 'w+') as outfile: json.dump(jsonData, outfile) print("Results Twitter in: " + str(path)) response={'results':str(path)} if len(people_list)>0: if knownimage: print("Compare similarity images.") face_identification(knownimage,'./data/twitter/'+str(now)+'_images/') response['images']='./data/twitter/'+str(now)+'_images/' response['recognized']='./data/twitter/'+str(now)+'_images/recognized/' driver.quit() return response
def load_fields_to_csv(driver): recommended_links = [] for tries in range(20): page = random.randint(0, 20) * 25 driver.get( f'https://www.linkedin.com/jobs/search/?f_TPR=r604800&keywords=junior%20data%20scientist&start={page}' ) # driver.get(f'https://www.linkedin.com/jobs/search/?f_TPR=r604800&keywords=junior%20data%20scientist&location=New%20York%20City%20Metropolitan%20Area&start={page}') sel = Selector(text=driver.page_source) job_listings = sel.xpath( '//a[contains(@href, "/jobs/view/")]/@href').getall() # alternate method: # sel.xpath('//a[has-class("disabled ember-view job-card-container__link job-card-list__title")]/@href').getall() job_listings = list(dict.fromkeys(job_listings)) # remove duplicates sleep(round(random.uniform(3, 20), 2)) for listing in reversed(job_listings): if len(recommended_links) != 0: listing = recommended_links[0] recommended_links.pop(0) driver.get('https://www.linkedin.com' + listing) url = driver.current_url if 'linkedin.comhttps' in url: driver.get(listing) sleep(2) sel = Selector(text=driver.page_source) ####################################################### job_title = sel.xpath( '//h1[has-class("jobs-top-card__job-title t-24")]/text()').get( ) if job_title: job_title = ''.join(job_title) if type(job_title) == str: job_title = job_title.strip() elif type(job_title) == list: job_title = ''.join(job_title) try: job_title = job_title.strip() except: continue job_title = validate_field(job_title) company_name = sel.xpath( '//a[has-class("jobs-top-card__company-url ember-view")]/text()' ).get() if company_name: company_name = ''.join(company_name) if type(company_name) == str: company_name = company_name.strip() elif type(company_name) == list: company_name = ''.join(company_name) try: company_name = company_name.strip() except: continue company_name = validate_field(company_name) location = sel.xpath( '//a[has-class("jobs-top-card__exact-location t-black--light link-without-visited-state")]/text()' ).getall() if location: location = ''.join(location) if type(location) == str: location = location.strip() elif type(location) == list: location = ''.join(location) try: location = location.strip() except: continue location = validate_field(location) posted_days_ago = sel.xpath( '//p[has-class("mt1 full-width flex-grow-1 t-14 t-black--light")]' ).get() if posted_days_ago: posted_days_ago = ''.join(posted_days_ago) if type(posted_days_ago) == str: posted_days_ago = posted_days_ago.strip() elif type(posted_days_ago) == list: posted_days_ago = ''.join(posted_days_ago) try: posted_days_ago = posted_days_ago.strip() except: continue posted_days_ago = validate_field(posted_days_ago) seniority_level = sel.xpath( '//p[has-class("jobs-box__body js-formatted-exp-body")]/text()' ).getall() if seniority_level: seniority_level = ''.join(seniority_level) if type(seniority_level) == str: seniority_level = seniority_level.strip() elif type(seniority_level) == list: seniority_level = ''.join(seniority_level) try: seniority_level = seniority_level.strip() except: continue seniority_level = validate_field(seniority_level) industry_job_functions = sel.xpath( '//li[has-class("jobs-box__list-item jobs-description-details__list-item")]/text()' ).getall() if industry_job_functions: industry_job_functions = ''.join(industry_job_functions) if type(industry_job_functions) == str: industry_job_functions = industry_job_functions.strip() elif type(industry_job_functions) == list: industry_job_functions = ''.join(industry_job_functions) try: industry_job_functions = 
industry_job_functions.strip() except: continue industry_job_functions = validate_field(industry_job_functions) employment_type = sel.xpath( '//p[has-class("jobs-box__body js-formatted-employment-status-body")]/text()' ).get() if employment_type: employment_type = ''.join(employment_type) if type(employment_type) == str: employment_type = employment_type.strip() elif type(employment_type) == list: employment_type = ''.join(employment_type) try: employment_type = employment_type.strip() except: continue employment_type = validate_field(employment_type) job_description = sel.xpath( '//div[has-class("jobs-box__html-content jobs-description-content__text t-14 t-normal")]' ).getall() if job_description: job_description = ''.join(job_description) if type(job_description) == str: job_description = job_description.strip() elif type(job_description) == list: job_description = ''.join(job_description) try: job_description = job_description.strip() except: continue job_description = validate_field(job_description) base_salary = sel.xpath( '//p[has-class("salary-main-rail__data-amount t-24 t-black t-normal")]/text()' ).get() if base_salary: base_salary = ''.join(base_salary) if type(base_salary) == str: base_salary = base_salary.strip() elif type(base_salary) == list: base_salary = ''.join(base_salary) try: base_salary = base_salary.strip() except: continue base_salary = validate_field(base_salary) applicants = sel.xpath('//span[has-class("ml1")]/text()').get() if applicants: applicants = ''.join(applicants) if type(applicants) == str: applicants = applicants.strip() elif type(applicants) == list: applicants = ''.join(applicants) applicants = validate_field(applicants) all_similar_job_links = sel.xpath( '//a[has-class("job-card__link-wrapper js-focusable-card ember-view")]' ).getall() all_similar_job_links = validate_field(all_similar_job_links) if all_similar_job_links != 'NaN': for similar_job in all_similar_job_links: start = similar_job.find('<') end = similar_job.find('href="') if start != -1 and end != -1: result = similar_job[start:end + 6] similar_job = similar_job.replace(result, '') end_quote = similar_job.find('"') rec_url = similar_job[0:end_quote] recommended_links.append(rec_url) url = driver.current_url url = validate_field(url) try: with open(parameters.file_name, 'a', newline='') as csvfile: spamwriter = csv.writer(csvfile, delimiter=',', quoting=csv.QUOTE_MINIMAL) spamwriter.writerow([ job_title, company_name, location, posted_days_ago, seniority_level, industry_job_functions, employment_type, job_description, base_salary, applicants, all_similar_job_links, url ]) except: continue sleep(round(random.uniform(3, 10), 2)) return [ job_title, company_name, location, posted_days_ago, seniority_level, industry_job_functions, employment_type, job_description, base_salary, applicants, all_similar_job_links, url ] # driver.quit()
hxs = Selector(text=data) posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract() posted.append(posts) else: hxs = Selector(text=data) posts = hxs.xpath('//ul[@class="archive"]/li/span[@class="channel markets_and_finance"]/following-sibling::h1/a/@href').extract() posted.append(posts) return posted if __name__ == '__main__': print("in main") totalWeeks = [] totalPosts = [] url = 'http://www.businessweek.com/archive/news.html#r=404' data = urllib.request.urlopen(url).read() data = data.decode("utf-8") sel = Selector(text=data) months = sel.xpath('//ul/li/a').re('http://www.businessweek.com/archive/\\d+-\\d+/news.html') #admittMonths = 12*(2015-1991) + 8 m=[] for i in months: m.append([i]) totalWeeks = [] pool = Pool(8) totalWeeks= pool.map(mon,m) totalWeeks = [ent for sublist in totalWeeks for ent in sublist] print (len(totalWeeks)) #club = [ent for sublist in totalWeeks for ent in sublist] #print (len(club)) club = [ent for sublist in totalWeeks for ent in sublist] print (len(club)) d=[]
from selenium import webdriver

browser = webdriver.Chrome()
# drive Chrome to open the slider-captcha demo page
browser.get('http://www.porters.vip/captcha/jigsaw.html')
# locate the slider
jigsawCircle = browser.find_element_by_css_selector('#jigsawCircle')
action = webdriver.ActionChains(browser)
# click and hold without releasing
action.click_and_hold(jigsawCircle).perform()
# get the HTML of the current page
html = browser.page_source

import re
from parsel import Selector

sel = Selector(html)
# get the CSS styles of the puzzle piece and the gap
mbk_style = sel.css('#missblock::attr("style")').get()
tbk_style = sel.css('#targetblock::attr("style")').get()
# lambda that extracts the left property value from a CSS style string
extract = lambda x: ''.join(re.findall('left: (\d+|\d+.\d+)px', x))
# pull the left value out of both styles
mbk_left = extract(mbk_style)
tbk_left = extract(tbk_style)
# compute the distance the jigsaw slider has to move
distance = float(tbk_left) - float(mbk_left)
action.move_by_offset(distance, 0)  # set the slide distance
action.release().perform()  # release the mouse
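# --- Hedged usage sketch (not part of the original script) ---
# Illustrates what the `extract` helper above pulls out of a CSS "left"
# declaration; the inline style strings are hypothetical, not taken from the real page.
import re
_extract = lambda x: ''.join(re.findall('left: (\d+|\d+.\d+)px', x))
assert _extract('opacity: 1; left: 103.5px;') == '103.5'
assert _extract('left: 47px;') == '47'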
except urllib.error.URLError: pass except urllib.error.HTTPError: pass except timeout: pass else: fail.append(s[i]) print ("failed to retive info from ",s[i],i) flag = True if flag ==True: pass else: clap = response.read() clap = clap.decode("utf-8") h = Selector(text=clap) date = h.xpath('//meta[@content][@name="pub_date"]/@content').extract() if date: pass else: date = h.xpath('//meta[@content][@name="parsely-pub-date"]/@content').extract() key = h.xpath('//meta[@content][@name="keywords"]/@content').extract() info = h.xpath('//div[@id = "article_body"]/p//text()').extract() if not info: info = h.xpath('//div[@class = "article-body__content"]/p//text()').extract() if len(info)>1: info = ' '.join(str(r) for r in info) info = info.replace(u"\xa0", u" ") if "T" in date[0]: date,t = date[0].split('T') else:
def test_make_links_absolute(self):
    text = u'<a href="file.html">link to file</a>'
    sel = Selector(text=text, base_url='http://example.com')
    sel.root.make_links_absolute()
    self.assertEqual(u'http://example.com/file.html',
                     sel.xpath('//a/@href').extract_first())
def l_par_html(url):
    # this function scrapes Lianjia's second-hand housing listings for Nanjing
    wr = requests.get(url, headers=headers, stream=True)
    sel = Selector(wr.text)
    h_test = sel.xpath('//h2[@class="total fl"]').extract()
    title = sel.xpath('//li//div//div[@class="title"]/a/text()').extract()
    pos1 = sel.xpath(
        '//li//div//div[@class="flood"]//div/a[@data-el="region"]/text()'
    ).extract()
    pos2 = sel.xpath(
        '//li//div//div[@class="flood"]//div/a[2]/text()').extract()
    houseInfo = sel.xpath(
        '//li//div//div[@class="address"]//div/text()').extract()
    followInfo = sel.xpath(
        '//li//div//div[@class="followInfo"]/text()').extract()
    tags = sel.xpath('//li//div//div[@class="tag"]').xpath(
        'string(.)').extract()
    # print(tags)
    # tag_taxfree = sel.xpath('//li//div//div[@class="tag"]/span[@class="taxfree"]/text()').extract()
    # tag_haskey = sel.xpath('//li//div//div[@class="tag"]/span[@class="haskey"]/text()').extract()
    total_price = sel.xpath(
        '//li//div//div[@class="priceInfo"]//div[@class="totalPrice"]//span/text()'
    ).extract()
    unit_price = sel.xpath(
        '//li//div//div[@class="priceInfo"]//div[@class="unitPrice"]//span/text()'
    ).extract()
    if len(total_price) != len(title) or \
            len(tags) != len(title) or len(followInfo) != len(title) or len(houseInfo) != len(title) or \
            len(pos1) != len(title) or len(pos2) != len(title):
        print("Warnings! Length of some item does not match.")
    # print(len(tags))
    pages_info = pd.DataFrame(
        list(zip(title, pos1, pos2, houseInfo, followInfo, tags, total_price, unit_price)),
        columns=[
            'Title', 'Position1', 'Position2', 'HouseInfo', "FollowInfo",
            "Tags", "Total_Price", "Unit_Price"
        ])
    return pages_info
json={'data': booth}, auth=('admin', 'SUPER-SECURE-PASSWORD')) r.raise_for_status() with open('lamberti.json') as jsonfile: jsondata = json.load(jsonfile) with open('lamberti.geojson') as geojsonfile: geojsondata = json.load(geojsonfile) for booth in jsondata: booth_no = int(booth['nr'][-2:]) r = requests.get(booth['url']) if r.status_code == 200: text = r.text selector = Selector(text=text) booth_name = selector.css('.booth-title::text').get() booth_descr = selector.css('.booth-body > p::text').getall() if isinstance(booth_descr, list): booth_descr = " ".join(booth_descr) booth_owner_company = selector.css( '.contactParticle--company::text').get() booth_owner_name = selector.css( '.contactParticle--name\:firstname\,lastname::text').get() booth_owner_street = selector.css( '.contactParticle--street::text').get() booth_owner_city = selector.css( '.contactParticle--city\:postal_code\,locality::text').get()
url = 'http://www.sinomed.ac.cn/lw/basicSearch.do?dbtype=lw&pageNo=1&pageSize=100&change=true&cmode=&flag=&time=&linkSearch=&more=&moreExp=&searchmore=&searchword=+%22{}%22%5B%E5%87%BA%E7%89%88%E5%B9%B4%5D&submitButton=&beginDate=&endDate='.format( str(year)) message = (str(year), url) url_que.put(message) while True: if not url_que.empty(): try: message = url_que.get() year = message[0] url = message[1] res = requests.get(url, proxies=proxy) res.encoding = res.apparent_encoding fname = r'E:\work\SinoMed_tasks\博硕论文\html' + '/' + "%s.html" % str( year) html = Selector(res.text, 'html') totalnum = html.xpath( "//input[@id='itemTotal']/@value").extract_first('') if totalnum == '0': print("%s年无文章暂不下载" % year) else: allnum = int(totalnum) count += allnum with open(fname, 'w', encoding='utf8') as f: f.write(res.text) print("%s年有%s文章" % (year, totalnum)) print("一共有%s文章" % (str(count))) except Exception as e: print("停止时年份为%s" % year) print('停止时count的个数%s' % str(count)) url_que.put(message)
import requests
from parsel import Selector
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from string import punctuation
from heapq import nlargest
from collections import defaultdict
from nltk.probability import FreqDist

url = 'http://www.saopaulo.sp.gov.br/spnoticias/ultimas-noticias/centro-paula-souza-e-ibm-apresentam-modelo-educacional-p-tech/'
text = requests.get(url).text
selector = Selector(text=text)

title = selector.xpath(
    '//header[contains(@class, "article-header")]//h1/text()').get()
legend = selector.xpath(
    '//header[contains(@class, "article-header")]//p/text()').get()
paragraphs = selector.xpath(
    '//article[contains(@class, "article-main")]//p/text()').getall()

text_to_analyse = ''
for paragraph in paragraphs:
    text_to_analyse += paragraph

sentences = sent_tokenize(text_to_analyse)
words = word_tokenize(text_to_analyse.lower())

stopwords = set(stopwords.words('portuguese') + list(punctuation))
words_without_stopwords = [word for word in words if word not in stopwords]
def _parse_me(cls, base_fname): json_fname = "{}.json".format(base_fname) html_fname = "{}.html".format(base_fname) resp = { "intro": {}, "declaration": {} } try: with open(json_fname, "r") as fp: data = json.load(fp) with open(html_fname, "r") as fp: raw_html = fp.read() html = Selector(raw_html) except ValueError: print( "File {} or it's HTML counterpart cannot be parsed".format(json_fname)) return None except FileNotFoundError: print( "File {} or it's HTML counterpart cannot be found".format(json_fname)) return None id_ = data.get("id") created_date = data.get("created_date") raw_html_lowered = raw_html.lower() for chunk in cls.dangerous_chunks: if chunk in raw_html_lowered: raise BadHTMLData("Dangerous fragment found: {}, {}".format( id_, base_fname)) try: data = data["data"] except KeyError: raise BadJSONData("API brainfart: {}, {}".format(id_, base_fname)) if "step_0" not in data: raise BadJSONData("Bad header format: {}, {}".format(id_, base_fname)) resp["_id"] = "nacp_{}".format(id_) resp["ft_src"] = "\n".join(cls.extract_textual_data(html)) resp["nacp_orig"] = data resp["declaration"]["url"] = "https://public.nazk.gov.ua/declaration/{}".format(id_) resp["declaration"]["source"] = "NACP" resp["declaration"]["basename"] = os.path.basename(base_fname) resp["intro"]["corrected"] = id_ in cls.corrected resp["intro"]["date"] = cls.parse_date(created_date) if "declarationType" not in data["step_0"] or "changesYear" in data["step_0"]: resp["intro"]["doc_type"] = "Форма змін" if "changesYear" in data["step_0"]: resp["intro"]["declaration_year"] = int(data["step_0"]["changesYear"]) else: resp["intro"]["doc_type"] = cls.declaration_types[data["step_0"]["declarationType"]] if "declarationYearTo" in data["step_0"]: resp["intro"]["declaration_year_to"] = cls.parse_date(data["step_0"]["declarationYearTo"]) if "declarationYearFrom" in data["step_0"]: resp["intro"]["declaration_year_from"] = cls.parse_date(data["step_0"]["declarationYearFrom"]) resp["intro"]["declaration_year"] = resp["intro"]["declaration_year_from"].year if "declarationYear1" in data["step_0"]: resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear1"]) if "declarationYear3" in data["step_0"] and data["step_0"]["declarationYear3"]: resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear3"]) if "declarationYear4" in data["step_0"] and data["step_0"]["declarationYear4"]: resp["intro"]["declaration_year"] = int(data["step_0"]["declarationYear4"]) resp["general"] = { "last_name": replace_apostrophes(title(data["step_1"]["lastname"])), "name": replace_apostrophes(title(data["step_1"]["firstname"])), "patronymic": replace_apostrophes(title(data["step_1"]["middlename"])), "full_name": replace_apostrophes("{} {} {}".format( title(data["step_1"]["lastname"]), title(data["step_1"]["firstname"]), title(data["step_1"]["middlename"]), )), "post": { "post": replace_apostrophes(data["step_1"].get("workPost", "")), "post_type": replace_apostrophes(data["step_1"].get("postType", "")), "office": replace_apostrophes(data["step_1"].get("workPlace", "")), "actual_region": replace_apostrophes(cls.region_types.get(data["step_1"].get("actual_region", ""), "")), "region": replace_apostrophes(cls.region_types.get(data["step_1"].get("region", ""), "")), } } if "step_2" in data: family = data["step_2"] if isinstance(family, dict): resp["general"]["family"] = [] for member in family.values(): if not isinstance(member, dict): continue resp["general"]["family"].append({ "family_name": replace_apostrophes("{} {} {}".format( 
title(member.get("lastname", "")), title(member.get("firstname", "")), title(member.get("middlename", "")), )), "relations": member.get("subjectRelation", "") }) # get regions from estate list if "step_3" in data and isinstance(data["step_3"], dict) and data["step_3"]: if "estate" not in resp: resp["estate"] = [] for estate in data["step_3"].values(): if isinstance(estate, dict) and "region" in estate: region = replace_apostrophes(cls.region_types.get(estate.get("region", ""), "")) if region: resp["estate"].append({"region": region}) if "step_4" in data and isinstance(data["step_4"], dict) and data["step_4"]: if "estate" not in resp: resp["estate"] = [] for estate in data["step_4"].values(): if isinstance(estate, dict) and "region" in estate: region = replace_apostrophes(cls.region_types.get(estate.get("region", ""), "")) if region: resp["estate"].append({"region": region}) if "estate" in resp: estate_list = html.css( "table:contains('Місцезнаходження') td:contains('Населений пункт') span::text" ).extract() for estate in estate_list: region = cls.decode_region(estate) if region: resp["estate"].append({"region": region}) resp['general']['full_name_suggest'] = [ { 'input': resp['general']['full_name'], 'weight': 5 }, { 'input': ' '.join( [ resp['general']['name'], resp['general']['patronymic'], resp['general']['last_name'] ] ), 'weight': 3 }, { 'input': ' '.join( [ resp['general']['name'], resp['general']['last_name'] ] ), 'weight': 3 } ] resp['general']['full_name_for_sorting'] = keyword_for_sorting(resp['general']['full_name']) if not resp["general"]["post"]["region"]: region_html = html.css( "fieldset:contains('Зареєстроване місце проживання') .person-info:contains('Місто')::text" ).extract() if len(region_html) > 1: resp["general"]["post"]["region"] = cls.decode_region(region_html[1]) if not resp["general"]["post"]["actual_region"]: region_html = html.css( "fieldset:contains('Місце фактичного проживання') .person-info:contains('Місто')::text" ).extract() if len(region_html) > 1: resp["general"]["post"]["actual_region"] = cls.decode_region(region_html[1]) # if set only one region use it value for second one if not resp["general"]["post"]["actual_region"] and resp["general"]["post"]["region"]: resp["general"]["post"]["actual_region"] = resp["general"]["post"]["region"] elif not resp["general"]["post"]["region"] and resp["general"]["post"]["actual_region"]: resp["general"]["post"]["region"] = resp["general"]["post"]["actual_region"] resp["index_card"] = concat_fields(resp, NACPDeclaration.INDEX_CARD_FIELDS) return NACPDeclaration(**resp).to_dict(True)
def parse(html, callback, *args, **kwargs):
    html = html.decode('utf8')
    html = html.encode('latin1')
    html = html.decode('gb2312', 'ignore')
    sel = Selector(text=html)
    return callback(sel, *args, **kwargs)
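# --- Hedged usage sketch (not part of the original module) ---
# `parse` above decodes the incoming bytes as UTF-8, re-encodes them as
# latin1 and decodes the result as GB2312 before handing a Selector to the
# callback. The callback below is a hypothetical example, not from the source.
def extract_title(sel, default=None):
    return sel.css('title::text').get(default=default)

# raw_bytes would come from e.g. requests.get(url).content
# title = parse(raw_bytes, extract_title, default='')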
return country client = pymongo.MongoClient("mongodb+srv://ybh:[email protected]/resumely?retryWrites=true&w=majority") database = client["resumelydb"] profiles_collection = database['profiles'] done = set() extracted_data = {} extracted_data['candidates'] = [] driver = webdriver.Chrome(executable_path='C:\chromedriver_win32\chromedriver_80') pickle.dump( driver.get_cookies() , open("cookies.pkl","wb")) driver.maximize_window() for x in profiles_collection.find(no_cursor_timeout=True).skip(3600): driver.get(x['profile']) sleep(5) sel = Selector(text=driver.page_source) age = sel.xpath('//*[starts-with(@class,"widgetUserInfo__item widgetUserInfo__item_age")]/text()').extract_first() if age: age = age.strip() age = validate_field(age) try: if age != 'No Results': age = ' '.join(age.split()) for a in age.split(): if a.isdigit(): profiles_collection.update_one({"_id" : bson.ObjectId(x['_id'])},{ "$set": { "age": int(a) } }) except: pass # experiences experiencesTab = [] skills = []
def find_matches(self, sel: Selector) -> Generator[Match, None, None]:
    """
    Generator to find live and upcoming matches in parsel.Selector object
    :returns: Generator for Match objects
    """
    yield from self._find_matches(sel.xpath("//table[@id='gb-matches']//tr"))
def parse_playInfo(self, response): play = copy.deepcopy(response.meta["playInfoObj"]) # 赛事id---轮次 由上层传入 # url play["play_urls"] = response.url # 赛事时间 time_str = response.css("div.qbox_1 div.qbx_2 p::text").extract_first() if time_str == None: play["play_time"] = None else: if time_str.encode("utf-8") == "延期": play["play_result_detail"] = "延期" else: time_year = time_str.split("-")[0] if int(time_year) >= 0 and int(time_year) < 30: time_str = "20" + time_str else: time_str = "19" + time_str time_str = time_str[0:10] + " " + time_str[11:] play["play_time"] = time_str # 主队 play["team_home"] = response.css( "#matchTeam div.qpai_zi::text").extract_first() # 客队 play["team_vis"] = response.css( "#matchTeam div.qpai_zi_1::text").extract_first() # score_half = response.css("div.jifen_dashi p").extract_first() if score_half != None: score_half_p = Selector( text=score_half).xpath("//p/text()").extract_first() if score_half_p != None: score_half = score_half_p if score_half != None: # 半:1-1 score_half_arr = score_half.strip()[2:].split(" ")[0].split( "-") # 比分半场主 play["half_home"] = score_half_arr[0] # 比分半场客 play["half_vis"] = score_half_arr[1] else: # 比分半场主 play["half_home"] = None # 比分半场客 play["half_vis"] = None # $("#matchTeam div.vs span")[0] score_full = response.css("#matchTeam div.vs span").extract() if score_full == None or len(score_full) == 0: # 比分全场主 play["full_home"] = None # 比分全场客 play["full_vis"] = None # 赛事结果 play["play_result"] = None else: # 比分全场主 play["full_home"] = int( Selector( text=score_full[0]).css("::text").extract_first().strip()) # 比分全场客 play["full_vis"] = int( Selector( text=score_full[1]).css("::text").extract_first().strip()) # 赛事结果 if play["full_home"] == play["full_vis"]: play["play_result"] = 1 else: if play["full_home"] > play["full_vis"]: play["play_result"] = 3 else: play["play_result"] = 0 # # 指数详情-额外的请求 # /soccer/match/954629/odds/ajax/?page=0&trnum=5&companytype=BigBooks&type=1 # /soccer/match/954629/odds/ajax/?page=0&trnum=5&companytype=AuthoriteBooks&type=1 odds_url = "/soccer/match/{0}/odds/ajax/?page=0&trnum=5&companytype=AuthoriteBooks&type=1".format( play["id"]) return [ scrapy.Request(url=self.base_url + odds_url, headers=self.headers, meta={ 'cookiejar': response.meta['cookiejar'], "playInfoObj": play }, callback=self.parse_oddsInfo) ]
def find_history(self, sel: Selector) -> Generator[Match, None, None]:
    """
    Generator to find recent matches in parsel.Selector object
    :returns: Generator for Match objects
    """
    yield from self._find_matches(sel.xpath("//h2[contains(text(),'Recent')]/..//tr"))