Example no. 1
def parse_res_list(cont):
    # Pull PDF links out of a search-result page: the URLs appear
    # percent-encoded (http%3a%2f%2f...), so rebuild them as plain http(s) URLs.
    ret = []
    if cont == '':
        return set()
    for url in str_util.cut_windows(cont, 'http%3a%2f%2f', '.pdf'):
        if url.find('<') >= 0 or url.find('>') >= 0:
            continue
        ret.append('http://%s.pdf' % url.replace('%2f', '/'))
    for url in str_util.cut_windows(cont, 'https%3a%2f%2f', '.pdf'):
        if url.find('<') >= 0 or url.find('>') >= 0:
            continue
        ret.append('https://%s.pdf' % url.replace('%2f', '/'))
    return set(ret)
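Every example on this page uses str_util.cut_windows, whose implementation is not shown. The snippet below is only a stand-in sketch of the behaviour the examples appear to assume: collect every substring that sits between an occurrence of the left marker and the next occurrence of the right marker.

# Stand-in sketch, NOT the real str_util implementation; inferred from usage only.
def cut_windows(cont, left, right):
    ret = []
    pos = 0
    while True:
        start = cont.find(left, pos)
        if start < 0:
            break
        start += len(left)
        end = cont.find(right, start)
        if end < 0:
            break
        ret.append(cont[start:end])   # keep the text between the two markers
        pos = end + len(right)        # continue scanning after the right marker
    return ret

# Under that assumption, the loop in parse_res_list sees windows like this:
html = 'href="http%3a%2f%2fexample.com%2fdoc.pdf"'
print(cut_windows(html, 'http%3a%2f%2f', '.pdf'))   # ['example.com%2fdoc']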
Example no. 2
def get_department(cont):
    # Extract the treatment-department field (就诊科室) from the page.
    r1 = str_util.cut_windows(cont,
                              '就诊科室:</span><span class="fl txt-right">',
                              '</span></p>')
    if len(r1) > 0:
        return r1[0].strip()
    return ''
Example no. 3
def get_group(cont):
    # Extract the susceptible-population field (易感人群) from the page.
    r1 = str_util.cut_windows(cont,
                              '易感人群:</span><span class="fl txt-right">',
                              '</span></p>')
    if len(r1) > 0:
        return r1[0].strip()
    return ''
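Examples no. 2 and no. 3 scrape labelled fields from a Chinese medical page. The fragment below is invented purely to illustrate the markup the marker strings expect (it is not real page data), and it assumes the cut_windows sketch from Example no. 1.

sample = ('<p><span>就诊科室:</span><span class="fl txt-right">内科</span></p>'
          '<p><span>易感人群:</span><span class="fl txt-right">儿童</span></p>')
print(get_department(sample))   # '内科' (internal medicine)
print(get_group(sample))        # '儿童' (children)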
Example no. 4
    def parse_link(self, ori_url, page):
        # For Yahoo search result pages, extract percent-encoded PDF links from the raw HTML.
        new_url_list = []
        if ori_url.startswith('https://search.yahoo.com/search'):
            try:
                page = page.decode('utf-8')
            except Exception:
                print('page decode failed', file=sys.stderr)

            for link in str_util.cut_windows(page, 'http%3a%2f%2f', '.pdf'):
                if link.find('<') >= 0 or link.find('>') >= 0:
                    continue
                link = 'http://%s.pdf' % link.replace('%2f', '/')
                new_url_list.append(link)

        soup = BeautifulSoup(page, 'html.parser')
        for a in soup.find_all('a', href=True):
            link = a['href']
            if link != '' and link.startswith('/'):
                link = get_site(ori_url) + link
                if not link.startswith('http:'):
                    link = 'http://' + link
            valid = self.check_link(link)
            if self.debug:
                print('link\t%s\t%s\t%d' % (ori_url, link, valid),
                      file=sys.stderr)
            if not valid:
                continue
            new_url_list.append(link)

        link_num = 0
        # The shared visited-URL store and the URL queue are updated under one lock.
        lock.acquire()
        for link in new_url_list:
            url_visited = check_key(self.url_db_dir, link)
            if url_visited:
                continue
            key = bytes(link, encoding='utf-8')
            add_kv(self.url_db_dir, key, b'')
            if len(self.url_queue) < self.queue_max_size:
                self.url_queue.append(link)
                link_num += 1
            else:
                print('warning: queue size exceeded', file=sys.stderr)
        lock.release()
        return link_num
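check_key and add_kv are external key-value-store helpers whose code is not shown on this page. The sketch below reproduces the same dedup-and-enqueue step with an in-memory set standing in for the on-disk store; the queue limit and module-level lock are assumptions mirroring the code above.

import sys
import threading

lock = threading.Lock()
visited = set()          # stands in for the url_db_dir key-value store
url_queue = []
QUEUE_MAX_SIZE = 10000   # assumed limit; the real value is set elsewhere

def enqueue_new(links):
    added = 0
    with lock:           # one critical section covers the dedup check and the enqueue
        for link in links:
            if link in visited:
                continue
            visited.add(link)
            if len(url_queue) < QUEUE_MAX_SIZE:
                url_queue.append(link)
                added += 1
            else:
                print('warning: queue size exceeded', file=sys.stderr)
    return added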
Example no. 5
	def parse_link(self, ori_url, page):
		# For Yahoo search result pages, extract percent-encoded PDF links from the raw HTML.
		link_num = 0
		if ori_url.startswith('https://search.yahoo.com/search'):
			try:
				page = page.decode('utf-8')
			except Exception:
				print('page decode failed', file=sys.stderr)

			for link in str_util.cut_windows(page, 'http%3a%2f%2f', '.pdf'):
				if link.find('<') >= 0 or link.find('>') >= 0:	
					continue
				link = 'http://%s.pdf' % link.replace('%2f', '/')
				lock.acquire()
				self.url_queue.append(link)
				#self.url_queue.append('http://' + urllib.parse.urlparse(link).netloc)
				link_num += 1
				lock.release()

		soup = BeautifulSoup(page, 'html.parser')
		for a in soup.find_all('a', href=True):
			link = a['href']
			if link != '' and link.startswith('/'):
				link = get_site(ori_url) + link
			#print('link\t%s\t%s' % (ori_url, link))
			if not self.check_link(link):
				continue
			lock.acquire()
			if len(self.url_queue) < self.queue_max_size:
				self.url_queue.append(link)
				link_num += 1
			else:
				print('warning: queue size exceeded', file=sys.stderr)
			lock.release()
	
		return link_num
Example no. 6
def get_dis(cont):
    # Extract the disease name from the page title ('<title>XXX的症状...').
    r1 = str_util.cut_windows(cont, '<title>', '的症状')
    if len(r1) > 0:
        return r1[0]
    return ''
Example no. 7
def get_sym(cont):
    # Extract the symptom list: first isolate the block styled with
    # 'db f12 lh240 mb15', then collect the anchor texts inside it.
    r1 = str_util.cut_windows(cont, 'db f12 lh240 mb15', '</span>')
    if len(r1) != 1:
        return []
    return str_util.cut_windows(r1[0], '"_blank">', '</a>')
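As with Examples no. 2 and no. 3, the fragment below is invented only to show the markup get_dis and get_sym expect, again assuming the cut_windows sketch from Example no. 1.

sample = ('<title>感冒的症状</title>'
          '<span class="db f12 lh240 mb15">'
          '<a target="_blank">咳嗽</a><a target="_blank">发热</a></span>')
print(get_dis(sample))   # '感冒' (the disease name before 的症状 in the title)
print(get_sym(sample))   # ['咳嗽', '发热']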
Example no. 8
import sys

sys.path.append('../../../alg/basic')
import str_util

# Split a crawl dump into individual PDF files: each record sits between
# a b'page: ' marker and a b'\n~EOF!\n' terminator.
with open(sys.argv[1], 'rb') as f:
    cont = f.read()
retdir = sys.argv[2]
i = 1
for part in str_util.cut_windows(cont, b'page: ', b'\n~EOF!\n'):
    with open('%s/%d.pdf' % (retdir, i), 'wb') as fp:
        fp.write(part)
    i += 1
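The dump layout the script assumes, inferred from the markers in the loop above (the script name split_dump.py is hypothetical):

# Each fetched document is stored as  b'page: ' + <raw PDF bytes> + b'\n~EOF!\n'.
# Assumed invocation:
#     python split_dump.py crawl_dump.bin out_pdf_dir/
# which writes out_pdf_dir/1.pdf, out_pdf_dir/2.pdf, ... one file per record.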