Пример #1
0
def analyseurl(urls):
    """
	功能:分析urls,返回列表格式的字典

	字典格式:{'name':names,'urls':url}
	这里将符合要求的页面信息插入数据库,还包括日志信息
	还包括 key的判断????
	"""
    returns = []
    print urls
    html = urllib2.urlopen(urls, timeout=50)
    try:
        conn = sqlite3.connect(options.dbfile)
        cor = conn.cursor()
        cor.execute(
            'create table if not exists keyofhtml( id integer primary key,urls text,key text,htmls text)'
        )
        data = html.read()
        rr = re.compile(r"""content\=["|']text\/html\;charset\=(\w*?)["|']""")
        m = rr.search(data)
        if m:
            code = m.group(1)
        if code:
            data = data.decode(code)
        rekey = re.compile(keyinsys)
        good = rekey.search(data)
        if good:
            data = data.replace("'", '"')  #纠结的单引号怎么处理?
            sqls = "insert into keyofhtml(urls,key,htmls) values('%s','%s','%s')"
            cor.execute(sqls % (urls, keyinsys, data))
            conn.commit()
        conn.close()
        logging2.debug('reading ' + urls)
        logging2.info('what should i write here')
        logging2.warning('a warning here')
        logging2.error('a error test here')
        logging2.critical('what is a critical??')
        #print 'reading'
    except:
        print 'error'
        logging2.error('error ong reading ' + urls)
    soup = BeautifulSoup.BeautifulSoup(data)
    temp = soup.findAll('a', href=re.compile(r'http.*'))  #为什么不直接用re匹配a标签
    logging2.debug('analysing ' + urls)
    #print 'analysing'
    for tt in temp:
        hrefs = tt['href']  #have?
        if hrefs.startswith('http'):
            if tt.string:  #span?????
                returns.append({'name': tt.string, 'urls': hrefs})
            else:
                returns.append({'name': 'NoName', 'urls': hrefs})
        else:
            continue
    return returns
Пример #2
0
def analyseurl(urls):
	"""
	功能:分析urls,返回列表格式的字典

	字典格式:{'name':names,'urls':url}
	这里将符合要求的页面信息插入数据库,还包括日志信息
	还包括 key的判断????
	"""
	returns=[]
	print urls
	html = urllib2.urlopen(urls,timeout=50)
	try:
		conn = sqlite3.connect(options.dbfile)
		cor = conn.cursor()
		cor.execute('create table if not exists keyofhtml( id integer primary key,urls text,key text,htmls text)')
		data = html.read()
		rr = re.compile(r"""content\=["|']text\/html\;charset\=(\w*?)["|']""")
		m = rr.search(data)
		if m:
			code = m.group(1)
		if code:
			data = data.decode(code)
		rekey = re.compile(keyinsys)
		good = rekey.search(data)
		if good:
			data = data.replace("'",'"')#纠结的单引号怎么处理?
			sqls = "insert into keyofhtml(urls,key,htmls) values('%s','%s','%s')"
			cor.execute(sqls%(urls,keyinsys,data))
			conn.commit()
		conn.close()
		logging2.debug('reading '+urls)
		logging2.info('what should i write here')
		logging2.warning('a warning here')
		logging2.error('a error test here')
		logging2.critical('what is a critical??')
		#print 'reading'
	except:
		print 'error'
		logging2.error('error ong reading '+urls)
	soup = BeautifulSoup.BeautifulSoup(data)
	temp = soup.findAll('a',href=re.compile(r'http.*'))#为什么不直接用re匹配a标签
	logging2.debug('analysing '+urls)
	#print 'analysing'
	for tt in temp:
		hrefs = tt['href']#have?
		if hrefs.startswith('http'):
			if tt.string:#span?????
				returns.append({'name':tt.string,'urls':hrefs})
			else:
				returns.append({'name':'NoName','urls':hrefs})
		else:
			continue
	return returns
Пример #3
0
			data = data.replace("'",'"')#纠结的单引号怎么处理?
			sqls = "insert into keyofhtml(urls,key,htmls) values('%s','%s','%s')"
			try:
				cor.execute(sqls%(urls,keyinsys,data))
			except UnicodeDecodeError,e:
				#print e
				cor.execute(sqls%(urls,keyinsys,'decode error'))
				logging2.error('reading '+urls+' decode error')
			conn.commit()
			#print 'donessss'
		conn.close()
		logging2.debug('reading '+urls)
		logging2.info('what should i write here')
		logging2.warning('a warning here')
		logging2.error('a error test here')
		logging2.critical('what is a critical??')
		#print 'reading'
	#except:
		#print 'error'
		#logging2.error('error ong reading '+urls)
	return returns



def main():
	i = 0
	th = threading2.ThreadPool(workQueue,resultQueue,options.number)
	td = threading2.MyThread2(workQueue,resultQueue,i,10)#屏幕打印进程

	while i <= options.deep:#层次循环
		if i == 0:
Пример #4
0
            data = data.replace("'", '"')  #纠结的单引号怎么处理?
            sqls = "insert into keyofhtml(urls,key,htmls) values('%s','%s','%s')"
            try:
                cor.execute(sqls % (urls, keyinsys, data))
            except UnicodeDecodeError, e:
                #print e
                cor.execute(sqls % (urls, keyinsys, 'decode error'))
                logging2.error('reading ' + urls + ' decode error')
            conn.commit()
            #print 'donessss'
        conn.close()
        logging2.debug('reading ' + urls)
        logging2.info('what should i write here')
        logging2.warning('a warning here')
        logging2.error('a error test here')
        logging2.critical('what is a critical??')
    return returns


def main():
    """
	执行入口,层次判断,任务转移.

	>>> main()
	  时间   深度    当前完成    待完成

	"""
    i = 0
    th = threading2.ThreadPool(workQueue, resultQueue, options.number)
    td = threading2.MyThread2(workQueue, resultQueue, i, 10)  #屏幕打印进程