Example No. 1
    def download_brackets(self, url):
        """
        Experimental
        Download and display brackets in the terminal
        """
        try:
            from terminalbrackets import Team, Bracket
        except ImportError:
            sys.exit('For brackets functionality "terminalbrackets" package is required')
        resp = requests.get(url)
        sel = Selector(text=resp.text)

        brackets = sel.css('.bracket-scroller')
        xtr_brackets = []
        for bracket in brackets:
            bracket_name = ''.join(bracket.xpath('../preceding-sibling::h3[1]//text()').extract())

            rounds = bracket.css('.bracket-column-matches')
            xtr_rounds = []
            for r in rounds:
                teams = r.xpath('.//div[contains(@class,"bracket-cell")]')
                xtr_teams = []
                for t in teams:
                    name = t.css('.team-template-team-bracket span::text').extract_first('')
                    score = t.css('.bracket-score::text').extract_first(0)
                    if name and score:
                        xtr_teams.append(Team(name, score))
                xtr_rounds.append(xtr_teams)
            xtr_brackets.append(Bracket(bracket_name, xtr_rounds))
        print(xtr_brackets)
        return xtr_brackets
Example No. 2
 def find_tournaments(self, category=None):
     """
     :param category: what category to show,
         choice from: EVENT_CURRENT[default], EVENT_PAST, EVENT_FUTURE
     :return: list of Events
     """
     if category is None:
         category = EVENT_CURRENT
     resp = requests.get(self.game_url)
     sel = Selector(text=resp.text)
     ongoing_events = sel.xpath("//li[contains(text(),'{}')]/..//a".format(category))
     if not ongoing_events:
         ongoing_events = sel.xpath("//div[contains(text(),'COMPLETED')]"
                                    "/following-sibling::div/a")
     ongoing = []
     for t in ongoing_events:
         event = Event()
         event['name'] = t.xpath('text()').extract_first('')
         event['date'] = t.xpath('small/text()').extract_first('').strip('()')
         event['url'] = urljoin(self.url_base, t.xpath('@href').extract_first(''))
         resp = requests.get(event['url'])
         sel = Selector(text=resp.text)
         info = sel.xpath("//div[contains(@class,'infobox-header')][contains(text(), 'League Info')]/../..")
         event['info'] = dict()
         for node in info.xpath("//div[contains(@class,'infobox-description')]"):
             title = node.xpath('text()').extract_first('').lower().strip(':')
             value = ''.join(node.xpath('following-sibling::div//text()').extract()).strip()
             url = node.xpath('following-sibling::div/a/@href').extract_first('')
             event['info'][title] = {'value': value, 'url': urljoin(self.url_base, url)} if url else value
         ongoing.append(event)
     return ongoing
Example No. 3
 def parse_node(self, response: TextResponse, selector: Selector):
     return {
         'title': selector.xpath('title/text()').get(),
         'link': selector.xpath('link/text()').get(),
         'guid': selector.xpath('guid/text()').get(),
         'pub_date': selector.xpath('pubDate/text()').get(),
         'description': selector.xpath('description/text()').get(),
     }
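For context, the sketch below shows how a parse_node like the one above is typically wired into a Scrapy XMLFeedSpider that iterates over the <item> elements of an RSS feed. The spider name and feed URL are placeholders, not taken from the original project.

from scrapy.spiders import XMLFeedSpider


class RssSpider(XMLFeedSpider):
    name = 'rss_example'                           # placeholder name
    start_urls = ['https://example.com/feed.xml']  # placeholder feed URL
    iterator = 'iternodes'  # streaming iterator over the XML nodes
    itertag = 'item'        # parse_node is called once per <item> element

    def parse_node(self, response, selector):
        # selector is positioned on a single <item>; pull the usual RSS fields
        return {
            'title': selector.xpath('title/text()').get(),
            'link': selector.xpath('link/text()').get(),
            'pub_date': selector.xpath('pubDate/text()').get(),
        }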
Example No. 4
    def basic_info(self):
        '''
        Basic data, with no position/holdings info.
        What we get is only SSE data, ??? or rather CSI, probably
        :return:
        '''

        r = requests.get(
            url='http://www.csindex.com.cn/zh-CN/search/indices?about=1',
            headers={'User-Agent': 'Molliza Firefox Chrome'})

        response = Selector(text=r.text)
        table = response.xpath(
            '//table[@class="table table-even table-bg  tc p_table tablePage"]'
        )
        index_list = table[0].xpath('.//tbody[@id="itemContainer"]/tr')

        for idx in index_list:
            code = idx.xpath('.//td[1]/a/text()').extract_first()
            detail_url = idx.xpath('.//td[1]/a/@href').extract_first()
            name = idx.xpath('.//td[2]/a/text()').extract_first()
            stock_count = idx.xpath('.//td[3]/text()').extract_first()
            price = idx.xpath('.//td[4]/text()').extract_first()
            month_ratio = idx.xpath('.//td[5]/text()').extract_first()
            month_ratio = month_ratio.replace('--', '')
            if len(month_ratio) == 0:
                month_ratio = 0

            type_ = idx.xpath('.//td[6]/text()').extract_first()
            hot_pot = idx.xpath('.//td[7]/text()').extract_first()
            area = idx.xpath('.//td[8]/text()').extract_first()
            coin = idx.xpath('.//td[9]/text()').extract_first()
            specified = idx.xpath('.//td[10]/text()').extract_first()
            index_type = idx.xpath('.//td[11]/text()').extract_first()

            obj = IndexObject(代码=code,
                              详细URL=detail_url,
                              指数名称=name,
                              股票数目=stock_count,
                              最新收盘=float(price),
                              一个月收益率=float(month_ratio),
                              资产类别=type_,
                              热点=hot_pot,
                              地区覆盖=area,
                              币种=coin,
                              定制=specified,
                              指数类别=index_type)

            try:
                self.sess.add(obj)
            except Exception as e:
                logger.error(e)
                self.sess.rollback()
            else:
                self.sess.commit()
Example No. 5
async def parser(request):
    args = dict(request.query) if request.method == 'GET' else dict(
        await request.json())
    url = args['url']
    xpath = args['xpath']
    type = args.get('type', 'backend')
    proxy = args.get('proxy', None)
    api = '{}?url={}&type={}{}'.format(config.DOWNLOAD_API, url, type,
                                       '&proxy={}'.format(proxy) if proxy else '')
    async with ClientSession() as session:
        async with session.get(url=api) as response:
            bytes = await response.read()
            html = bytes.decode('utf-8')
    if html:
        time_begin = time.time()
        xhtml = Selector(text=html)
        result = get_fields(xhtml, xpath) if xpath.startswith('{') else \
            get_urls(xhtml, xpath, parse.unquote(url))
        time_pass = str(round(time.time() - time_begin, 3))
    else:
        result = ''
        time_pass = '******'
    log = '{} {}s {}'.format(datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
                             time_pass.rjust(6, '0'), parse.unquote(url))
    print(log, result)
    return web.Response(text=json.dumps(result, ensure_ascii=False),
                        content_type='text/text',
                        charset='utf-8')
Example No. 6
class Parser:
    def __init__(self, text_or_selector):
        if isinstance(text_or_selector, str):
            self.selector = Selector(text_or_selector)
        else:
            self.selector = text_or_selector

    def get_text(self, selector, transform=None):
        text = self.selector.css(f'{selector}::text').get()
        if text:
            text = text.strip()
            if transform is not None:
                text = transform(text)
            return text
        return None

    def get_int(self, selector, transform=None):
        text = self.get_text(selector, transform)
        if text is None:
            return 0
        text = non_numeric.sub('', text)
        return int(text)

    def get_float(self, selector, transform=None):
        text = self.get_text(selector, transform)
        if text is None:
            return 0.0
        text = non_numeric.sub('', text)
        return float(text)

    def get_date(self, selector, transform=None):
        text = self.get_text(selector, transform)
        if text is None:
            return None
        return datetime.strptime(text, '%B %d, %Y').date()
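A small usage sketch for the Parser helper above. The class depends on a module-level non_numeric regex and on datetime/Selector imports that the snippet does not show; the definitions below are assumptions chosen to make the example run, not taken from the original source.

import re
from datetime import datetime          # used by Parser.get_date
from parsel import Selector            # used by Parser.__init__

non_numeric = re.compile(r'[^\d.\-]')  # assumed definition: strip currency signs, commas, units

html = '''
<div>
  <span class="price">$1,299.00</span>
  <span class="count">42 items</span>
  <span class="published">March 5, 2021</span>
</div>
'''

p = Parser(html)                   # Parser class as defined above
print(p.get_float('.price'))       # 1299.0
print(p.get_int('.count'))         # 42
print(p.get_date('.published'))    # 2021-03-05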
Example No. 7
 def get_qz_page(self, sess, code):
     '''
     Fetch the index constituent-weight page
     :return:
     '''
     # fetch the weights
     qz_url = 'http://www.csindex.com.cn/zh-CN/indices/index-detail/{}'
     s1 = sess.get(qz_url.format(code), headers={'User-Agent': 'Molliza Firefox Chrome'})
     return Selector(text=s1.text)
Example No. 8
def parse_job(html) -> dict:
    """find job details in job listing page"""
    sel = Selector(text=html)
    # setup some processing helpers
    join = lambda css, sep='': sep.join(sel.css(css).extract()).strip()
    first = lambda css: sel.css(css).extract_first(' ').strip()

    item = {}
    item['title'] = sel.css('h2.title::text').extract_first()
    item['location'] = join('.job-meta a::text', ', ')
    item['job_type'] = join('ul.list-unstyled a::text')
    item['posted_date'] = join(
        'div#affix-box p:contains("Posted:")::text').split(': ')[1]
    item['saved_times'] = join(
        'div#affix-box div:contains("Saved ")>strong::text')
    item['description'] = join('div.box-item-details p ::text')
    item['views'] = first(
        'div#affix-box li:contains("unique views")>strong::text')
    item['unique_views'] = first(
        'div#affix-box li:contains("views")>strong::text')

    bullets = lambda css: [
        ''.join(bullet.css('::text').extract()) for bullet in sel.css(css)
    ]
    h4_bullet = 'div.box-item-details h4:contains("{}")+ul>li'.format
    h3_bullet = 'div.box-item-details h3:contains("{}")+ul>li'.format
    item['about_you'] = bullets(h4_bullet('About You'))
    item['your_role'] = bullets(h4_bullet('Your role'))
    item['requirements'] = bullets(h4_bullet('Requirements'))
    item['nice_to_have'] = bullets(h4_bullet('Nice to have'))
    item['why_work_with_us'] = bullets(h4_bullet('Why work with us'))
    item['desired_skills'] = bullets(h3_bullet('Desired Skills'))
    item['contact'] = bullets(h3_bullet('Contact Info'))

    return item
Example No. 9
def top10Pass(driver, writer):
	driver.get('https://www.google.com')
	# locate search form by_name
	search_query = driver.find_element_by_name('q')
	# send_keys() to simulate the search text key strokes
	search_query.send_keys(parameters.query)
	# send_keys() to simulate the return key 
	search_query.send_keys(Keys.RETURN)

	# wait for page to load
	sleep(3.0)

	i = 0
	while(True):
		google_list_page = driver.current_url
		linked_in_urls = getProfileURLs(driver)
		for linked_in_url in linked_in_urls[:]:
			# get the profile URL
			driver.get(linked_in_url)
			# add a 5 second pause loading each URL
			sleep(5)
			# assigning the source code for the webpage to variable sel
			sel = Selector(text=driver.page_source)
			# get full name (first + last name) of user
			fullName = getFullName(sel)
			# returns a list of degree name followed by
			# field of study in alternating sequence
			degreeInfo = getDegreeInfo(sel)
			# returns a list of year started and year completed for each degree.
			# only need the first element to determine how long primary
			# degree has been held
			dateInfo = getDateInfo(sel)
			# write to csv file
			flushOutput(writer, fullName, degreeInfo, dateInfo)

		try:
			driver.get(google_list_page)
			next_result_element = driver.find_element_by_id('pnnext')
			next_page = next_result_element.get_attribute('href')
			driver.get(next_page)
		except NoSuchElementException:
			print('No next page found ')
			break
		i += 1
		if (i > 30):
			break
	return
Example No. 10
def get_fields(xhtml, xpath):
    fields_xpath = json.loads(xpath)
    one_fields = {}
    # fixed fields (configured values copied through as-is)
    for fix_field in fields_xpath['fix_fields']:
        one_fields[fix_field['field']] = fix_field['xpath']
    # single-record fields
    is_valid_one_fields = False  # whether any single-record data was actually extracted
    for one_field in fields_xpath['one_fields']:
        _xpaths = one_field['xpath'].split('|')
        if _xpaths[0].startswith('/'):  # xpath
            content = xpath_extract(xhtml=xhtml, xpath=_xpaths[0], mode='one')
        else:  # regex
            content = regex_extract(xhtml=xhtml, regex=_xpaths[0], mode='one')
        content = text_handler(content, _xpaths[1:])
        is_valid_one_fields = bool(content) or is_valid_one_fields
        one_fields[one_field['field']] = content
    # multi-record fields
    all_fields = []
    if fields_xpath['all_xpath']:
        sections = xpath_extract(xhtml=xhtml,
                                 xpath=fields_xpath['all_xpath'],
                                 mode='all')
        for section in sections:
            is_valid_all_fields = False
            section = Selector(text=section)
            fields = {}
            for all_field in fields_xpath['all_fields']:
                _xpaths = all_field['xpath'].split('|')
                if _xpaths[0].startswith('/'):  # xpath
                    content = xpath_extract(xhtml=section,
                                            xpath=_xpaths[0],
                                            mode='one')
                else:  # regex
                    content = regex_extract(xhtml=section,
                                            regex=_xpaths[0],
                                            mode='one')
                content = text_handler(content, _xpaths[1:])
                is_valid_all_fields = bool(content) or is_valid_all_fields
                fields[all_field['field']] = content
            if is_valid_one_fields or is_valid_all_fields:
                fields = dict(fields, **one_fields)
                all_fields.append(fields)
        return all_fields
    else:
        return [one_fields] if is_valid_one_fields else []
Example No. 11
def main(argv):

    csv_name_file = open(argv[1], 'r')
    reader = csv.reader(csv_name_file)
    next(reader)

    # build professor dictionary containing lastName-firstName key-value pairs
    profMap = buildProfMap(reader)
    print(profMap)

    file_list = os.listdir('htmlDownloads/htmlPages/')
    writer = csv.writer(open(argv[2], 'w', newline=''))
    writer.writerow([
        'LastName', 'FirstName', 'Degree', 'FOS', 'YearStarted', 'YearEarned'
    ])

    # Add .html page names to a list
    filtered_list = []
    for file_name in file_list[:]:
        if file_name.endswith("LinkedIn.html"):
            if "_ Search _" not in file_name:
                filtered_list.append(file_name)
    file_list = filtered_list

    for lastName in profMap:
        # find the html page matching this last name
        file_name = findUsersHtmlPage(file_list, lastName)
        if not file_name:
            print(lastName + " has no page!")
            continue
        htmlDoc = readHtmlDoc(file_name)
        sel = Selector(text=htmlDoc)
        fullName = getFullName(sel)
        degreeInfo = getDegreeInfo(sel)
        dateInfo = getDateInfo(sel)
        print(fullName)
        print(degreeInfo)
        print(dateInfo)
        flushOutput(writer, fullName, degreeInfo, dateInfo)

    return
Example No. 12
def extract_attachment(html: str,
                       content_url: str,
                       attachment_format_list=[]) -> list:
    """
    用于提取 html 中的附件名字及链接

    :param html: html
    :param content_url: html的原文url, 用于拼接附件链接
    :param attachment_format_list: 除了基础的附件格式 pdf, xls, doc, ppt, wps,txt, ceb 还可新增附件格式,如: xxx
    :return: e.g. [{"attachment_name": "附件1", "attachment_url": "http://xxx.com/P020180202506411419197.pdf"}]
    """
    attachment_format = '|'.join(
        set(ATTACHMENT_REGEXES +
            attachment_format_list))  # 'pdf|xls|doc|ppt|wps...'
    attachment_format_pattern = re.compile(rf'\.({attachment_format})[a-z]?$',
                                           flags=re.IGNORECASE)

    suspect_attachment_list = Selector(node_a_to_text(html)).xpath('//a')
    attachment_dict = {}
    for s in suspect_attachment_list:
        if not s.xpath('./@href').re_first(attachment_format_pattern):
            continue
        origin_file_name = s.xpath('string()').get('').strip() or \
                           s.xpath('./@title').get('').strip() or \
                           s.xpath('./@textvalue').get('').strip()
        if not origin_file_name:
            # continue  # some attachments legitimately have an empty name, so don't skip them
            logger.warning(
                f"Got an empty attachment name, origin node is ==={s.get()}===, content_url={content_url}"
            )

        attachment_name = attachment_format_pattern.sub('', origin_file_name)
        attachment_url = urljoin(content_url, s.xpath('./@href').get())
        if attachment_dict.get(fp := fingerprint(attachment_url)) is not None and \
                len(attachment_name) <= len(attachment_dict[fp]['attachment_name']):  # keep the longest file name
            continue
        file_info = {
            'attachment_name': attachment_name,
            'attachment_url': attachment_url
        }
        logger.debug(f"Get attachment successful: {file_info}")
        attachment_dict[fp] = file_info

    # return the collected attachments as a list of dicts, as the docstring describes
    return list(attachment_dict.values())
Example No. 13
 def destination(self, flight: Selector) -> Optional[str]:
     return flight.xpath('(.//span[@jsname="d6wfac"])[4]/text()').get()
Example No. 14
 def duration(self, flight: Selector) -> Optional[str]:
     return flight.xpath(
         './/div[@class="gvkrdb AdWm1c tPgKwe ogfYpf"]/text()').get()
Example No. 15
 def arrival(self, flight: Selector, search_date: date) -> Optional[str]:
     arrival_date = flight.xpath('.//g-bubble[2]').xpath(
         './/span[@class="eoY5cb"]/text()').get()
     return self._convert_date(arrival_date, search_date)
Example No. 16
 def departure(self, flight: Selector, search_date: date) -> Optional[str]:
     departure_date = flight.xpath('.//g-bubble[1]').xpath(
         './/span[@class="eoY5cb"]/text()').get()
     return self._convert_date(departure_date, search_date)
Example No. 17
 def operator(self, flight: Selector) -> Optional[str]:
     return flight.xpath(
         './/div[@class="TQqf0e sSHqwe tPgKwe ogfYpf"]/span/text()').get()
Example No. 18
 def price(self, flight: Selector) -> Optional[str]:
     return flight.xpath('.//div[@class="YMlIz tu4g7b"]/span/text()').get()
Example No. 19
def main(argv):

	writer = csv.writer(open(argv[2], 'w', newline=''))
	writer.writerow(['FirstName',
		'LastName',
		'Degree',
		'FOS',
		'YearStarted',
		'YearEarned'])

	# instantiates the chrome driver 
	driver = webdriver.Chrome(parameters.driverDirectory)
	# driver.get method() will navigate to a page given by the URL address
	driver.get('https://www.linkedin.com')
	# logs the user into their LinkedIn account
	driverLogIn(driver)
	delayRequest()

	profMap = buildProfessorMap(argv)

	# user-targeted google search approach
	for user in profMap:
		google_query = parameters.query
		google_query += ' AND "' + user + '"'
		driver.get('https://www.google.com')
		# locate search form by_name
		search_query = driver.find_element_by_name('q')
		# send_keys() to simulate the search text key strokes
		search_query.send_keys(google_query)
		delayRequest()
		# send_keys() to simulate the return key 
		search_query.send_keys(Keys.RETURN)
		# collect LinkedIn profile URLs from the Google results page
		linked_in_urls = getProfileURLs(driver)
		# go to the page contained in the first result of the query
		delayRequest()
		if len(linked_in_urls) > 0:
			driver.get(linked_in_urls[0])
		if ('authwall' in driver.current_url):
			driverLogIn(driver) 
		delayRequest()
		# assign the source code for the webpage to variable sel
		sel = Selector(text=driver.page_source)
		# get full name (first + last name) of user
		fullName = getFullName(sel)
		# does the page belong to the user undergoing search?
		if fullName:
			if (user not in fullName):
				print('Name \"' + user + '\" in file does not match \"' + fullName + '\"')
			else:
				print(user + " found!")
		# returns a list of degree name followed by
		# field of study in alternating sequence
		degreeInfo = getDegreeInfo(sel)
		# returns a list of year started and year completed for each degree.
		# only need the first element to determine how long primary
		# degree has been held
		dateInfo = getDateInfo(sel)
		# write to csv file
		flushOutput(writer, user, degreeInfo, dateInfo)
		# wait before redirecting to google search page

	# terminates the application
	driver.quit()
	return
Example No. 20
 def selector(self):
     if self._selector is None:
         self._selector = Selector(text=self._response.text)
     return self._selector
Example No. 21
 def stops(self, flight: Selector) -> Optional[str]:
     return flight.xpath(
         './/span[@class="pIgMWd ogfYpf"]/@aria-label').get()
Example No. 22
 def selector(self) -> parsel.Selector:
     from parsel.selector import Selector
     if self._cached_selector is None:
         self._cached_selector = Selector(self.html)
     return self._cached_selector
Example No. 23
def parse_pagination(html) -> Tuple[List, str]:
    """find all job links and next page link in pagination html"""
    sel = Selector(text=html)
    jobs = sel.css('div.item h3 a::attr(href)').extract()
    next_page = sel.css('a[aria-label=Next]::attr(href)').extract_first()
    return jobs, next_page
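A small usage sketch tying parse_pagination together with the parse_job helper from Example No. 8; the listing URL is a placeholder and the crawl loop is an assumption about how the two functions would be combined, not code from the original project.

from urllib.parse import urljoin
import requests

START_URL = 'https://example.com/jobs'  # placeholder listing URL

def crawl_jobs(start_url=START_URL):
    """Follow the pagination and collect the parsed details of every job."""
    results = []
    url = start_url
    while url:
        html = requests.get(url).text
        job_links, next_page = parse_pagination(html)
        for link in job_links:
            job_html = requests.get(urljoin(url, link)).text
            results.append(parse_job(job_html))  # parse_job from Example No. 8
        url = urljoin(url, next_page) if next_page else None
    return results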
Example No. 24
    def selector(self):
        """Returns the current state of the browser in a Selector

        We re-parse the site on each xpath, css, re call because we are running a web browser
        and the site may change between calls"""
        return Selector(text=self.page_source)
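For context, a minimal sketch of the kind of wrapper this property would live in: a class holding a Selenium WebDriver and re-parsing page_source on every access so queries always see the current DOM. The class name and helper methods are assumptions, not taken from the original source.

from parsel import Selector


class BrowserPage:
    """Thin wrapper around a Selenium WebDriver exposing parsel-style queries."""

    def __init__(self, driver):
        self.driver = driver

    @property
    def page_source(self):
        return self.driver.page_source

    @property
    def selector(self):
        # re-parse on every access because the live page may have changed
        return Selector(text=self.page_source)

    def css(self, query):
        return self.selector.css(query)

    def xpath(self, query):
        return self.selector.xpath(query)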