def artverify(art, html='', pdf=''):
    """
    Check whether HTML and PDF documents match abstract text

    Arguments:
        art: article or identifier accepted by toart()
        html (str): HTML text (optional)
        pdf (str): PDF text (optional)
    """
    # Cast article to Article
    art = toart(art)

    # Get article info
    info = artinfo({'xml': art.xml})

    # Quit if no abstract
    if info['abstxt'] is None:
        return None, None

    # Tokenize abstract
    abstxt = info['abstxt']
    abswords = re.split(r'\s+', abstxt)
    abswords = [word.lower() for word in abswords]

    # Ignore punctuation
    for char in ['.', ',', ';', ':']:
        abswords = [word.strip(char) for word in abswords]

    # Load HTML
    if not html:
        html = loadhtml(art, overwrite=True)

    # Load PDF
    if not pdf:
        pdf = loadpdf(art)
        pdf = to_unicode(pdf)

    # To lower-case
    html = html.lower()
    pdf = pdf.lower()

    # Check HTML: proportion of abstract words found in the HTML text
    if html:
        htmlwords = [word for word in abswords if html.find(word) > -1]
        htmlprop = float(len(htmlwords)) / len(abswords)
    else:
        htmlprop = None

    # Check PDF: proportion of abstract words found in the PDF text
    if pdf:
        pdfwords = [word for word in abswords if pdf.find(word) > -1]
        pdfprop = float(len(pdfwords)) / len(abswords)
    else:
        pdfprop = None

    # Return proportions (None where the corresponding document is missing)
    return htmlprop, pdfprop
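# Usage sketch (not from the original source): artverify() relies on the project
# helpers toart/artinfo/loadhtml/loadpdf used above; the identifier below is only
# an illustrative assumption about what toart() accepts.
def _example_artverify():
    art = '12345678'                        # hypothetical article identifier
    htmlprop, pdfprop = artverify(art)      # fractions of abstract words found
    if htmlprop is not None and htmlprop < 0.9:
        print('HTML full text may not match the abstract')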
def is404Error(html, debug=False):
    # strToFile(html, getExecDirectory(__file__) + "/tmp/test.html")
    # exit()
    # If we find any of these in the first "<title(.*)</title>", treat the page
    # as a 404 / error page (duplicates removed from the original list):
    match404Title = [
        "404", "error", "not found", "Moved Temporarily", "401 Unauthorized",
        "403 Forbidden", "Request Timeout", "Too Many Requests",
        "Service Unavailable", "404 ", " 404", "404 not found",
        "page not found", "404<", ">404",
    ]

    titleResult = re.finditer("<title(.*)</title>", html, re.DOTALL)
    if titleResult is None:
        return True
    titleResult = list(titleResult)
    if len(titleResult) == 0:
        return True

    # Extract the text of the first <title> tag
    title = None
    for current in titleResult:
        title = current.group(1)
        if title is None:
            return True
        if len(title) >= 1:
            title = title[1:]  # drop the '>' captured after "<title"
        title = title.lower()
        break

    for current in match404Title:
        if current.lower() in title:
            if debug:
                print(">>>>> " + current)
            return True

    # Or if any of these is in the body:
    match404Body = [
        "404 not found", "page not found", "404<", ">404", "Moved Temporarily",
        "401 Unauthorized", "403 Forbidden", "Request Timeout",
        "Too Many Requests", "Service Unavailable",
    ]
    htmlLower = html.lower()
    for current in match404Body:
        if current.lower() in htmlLower:
            if debug:
                print(">>>>> " + current)
            return True

    # Otherwise the page does not look like an error page
    return False
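# Usage sketch (assumption, not from the source): classify a fetched page.
# urllib2 matches the Python 2 style used elsewhere here; the URL is illustrative.
def _example_is404Error():
    import urllib2
    html = urllib2.urlopen('http://example.com/missing-page').read()
    if is404Error(html, debug=True):
        print('looks like an error / 404 page')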
def getTitle(url):
    # Fetch the page and parse out its <title>
    response = urllib.urlopen(url)
    html = response.read()
    html = html.replace(r'\"', '"')
    soup = BeautifulSoup(html.lower())
    urlTitle = soup.find('title')
    try:
        urlTitleText = urlTitle.text
    except AttributeError:
        # Fall back to lxml if BeautifulSoup found no <title>
        try:
            t = lxml.html.parse(url)
            urlTitleText = t.find(".//title").text
        except Exception:
            print "title not found"
            print url
            urlTitleText = ""
    return urlTitleText.lower()
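# Usage sketch (the URL is illustrative, not from the source):
def _example_getTitle():
    title = getTitle('http://example.com/')
    print(title)   # lower-cased <title> text, or '' if no title could be parsed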
def prep_for_search(html):
    html = strip_tags_django(html)
    html = html.lower()
    html = xhtml_unescape_tornado(html)
    return html[:100000]
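# Usage sketch (assumed input, not from the source): strip markup, lower-case,
# and unescape entities before indexing; output is capped at 100,000 characters.
def _example_prep_for_search():
    doc = '<p>Hello &amp; <b>World</b></p>'
    print(prep_for_search(doc))   # roughly: 'hello & world'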
def traverseURLSet():
    # Build a link adjacency matrix over the crawled URL set and collect anchor texts
    matrix = numpy.zeros(shape=(len(urlDict), len(urlDict)))
    for urlID in urlDict:
        l = urlDict[urlID]
        response = urllib.urlopen(l.url)
        html = response.read()
        html = html.replace(r'\"', '"')
        soup = BeautifulSoup(html.lower())
        alinks = soup.findAll('a')
        if alinks:
            for alink in alinks:
                try:
                    hrefFound = alink['href']
                except KeyError:
                    hrefFound = ""
                # Skip mailto-style links
                if re.match("mail", hrefFound):
                    continue
                # Drop fragment identifiers
                if re.search("#", hrefFound):
                    hrefFound = hrefFound.split("#")[0]
                # Normalise directory-style links with a trailing slash
                if hrefFound.rstrip("/") == hrefFound:
                    if hrefFound != "" and not re.search("html$", hrefFound) \
                            and not re.search("htm$", hrefFound) \
                            and not re.search("css$", hrefFound):
                        hrefFound = hrefFound + "/"
                urlFound = urljoin(l.url, hrefFound)
                if re.search("#", urlFound):
                    urlFound = urlFound.split("#")[0]
                if urlFound in urlUrlIDPair:
                    print alink, urlFound
                    # Mark the edge from the current page to the target page
                    row = urlUrlIDPair[l.url]
                    col = urlUrlIDPair[urlFound]
                    matrix[row][col] = 1
                    try:
                        alinkText = alink.text
                    except AttributeError:
                        alinkText = ""
                    # Record the anchor text on the target URL
                    # (renamed from urlID so the outer loop variable is not clobbered)
                    foundID = urlUrlIDPair[urlFound]
                    l1 = urlDict[foundID]
                    print l.url
                    print urlFound
                    print alinkText
                    l1.addAnchorText(alinkText)
                    # Also record the alt text of an <img> inside the anchor, if any
                    try:
                        alinksoup = BeautifulSoup(str(alink))
                        img = alinksoup.find('img')
                        alinkText = img['alt']
                    except Exception:
                        alinkText = ""
                    l1.addAnchorText(alinkText)
        else:
            print "No links found in", l.url
    return matrix
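# Hedged sketch of the globals traverseURLSet() expects. The names urlDict and
# urlUrlIDPair come from the source; the Link class shape below is an assumption
# inferred from the .url attribute and the addAnchorText() calls.
class Link(object):
    def __init__(self, url):
        self.url = url
        self.anchorTexts = []

    def addAnchorText(self, text):
        self.anchorTexts.append(text)

# urlDict      : url id  -> Link object        (assumed)
# urlUrlIDPair : url str -> matrix row/column  (assumed)
# matrix[i][j] == 1 means the page with id i links to the page with id j.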
def get_my_qh(self, cr, uid, context=None):
    """ Fetch my futures holdings """
    _logger.info("--------------------> start querying the futures I hold")
    my_user_obj = self.pool.get("cwz.qihuo.user")
    user_ids = my_user_obj.search(cr, uid, [])
    if user_ids and len(user_ids) > 0:
        user_list = my_user_obj.read(cr, uid, user_ids, ['lxt'], context=context)
        # _logger.info("--------------------> my lxt: " + str((user_list[0])['lxt']))
        num = random.randrange(100000002, 936619604)
        url = 'http://g.lexun.com/qh/myqh.php?cd=0&lxt=' + str((user_list[0])['lxt']) + '&_r=' + str(num) + '&vs=1'
        html = urllib2.urlopen(url).read()
        page = etree.HTML(html.lower().decode('utf-8'))
        my_qh_list = page.xpath(u"//div")

        # Futures holdings scraped from the web page
        my_qh_web_list = []
        for i, x in enumerate(my_qh_list):
            qh_line = lxml.html.tostring(x, pretty_print=True, encoding='utf-8')
            if "近期走势" in qh_line and i != 0:
                qh_line_code = qh_line
                qh_code = re.compile('''detail.php\?typeid=(.*?)&cd=0''').findall(qh_line_code)[0]
                qh_line_name = qh_line
                qh_line_name = qh_line_name.replace("\n", "")
                qh_name = re.compile('''z_banner02">(.*?):''').findall(qh_line_name)[0]
                dian = qh_name.find(".")
                qh_name = qh_name[(dian + 1):]
                qh_line_num = qh_line
                qh_line_num = qh_line_num.replace("\n", "")
                qh_num = re.compile('''共持有:(.*?)股''').findall(qh_line_num)[0]
                qh_now_price = re.compile('''当前价:(.*?)乐币''').findall(qh_line_num)[0]
                qh_old_price = re.compile('''成本价:(.*?)乐币''').findall(qh_line_num)[0]
                qh_sum_price = re.compile('''总成本:(.*?)乐币''').findall(qh_line_num)[0]
                qh_date = re.compile('''购入时间:(.*?)<br>''').findall(qh_line_num)[0]
                qh_amount = ''
                if 'color' in qh_line_num:
                    qh_amount = re.compile('''color:.*">(.*)%''').findall(qh_line_num)[0]
                else:
                    qh_amount = re.compile(qh_name + ''':(.*)%''').findall(qh_line_num)[0]
                qh_amount = str(float(qh_amount))
                trend_str = ''
                if float(qh_amount) > 0:
                    trend_str = '↑'
                elif float(qh_amount) == 0:
                    trend_str = '→'
                else:
                    trend_str = '↓'
                my_qh_web_list.append({
                    'code': qh_code,
                    'name': qh_name,
                    'now_price': CharactersUtil.chinese_to_num(qh_now_price),
                    'old_price': CharactersUtil.chinese_to_num(qh_old_price),
                    'sum_price': CharactersUtil.chinese_to_num(qh_sum_price),
                    'num': CharactersUtil.chinese_to_num(qh_num),
                    'date': CharactersUtil.to_utc_time(qh_date),
                    'amount': qh_amount,
                    'amount_str': qh_amount + "%",
                    'trend': trend_str
                })

        # Query the futures already stored locally
        qh_obj = self.pool.get('qh.myself')
        ids = qh_obj.search(cr, uid, [])
        res = qh_obj.read(cr, uid, ids, ['name', 'code', 'id'], context)
        res = [(r['name'], r['code'], r['id']) for r in res]
        qh_local_list = []
        for qh in res:
            qh_local_list.append(qh[1])

        # Create records for futures we do not have yet, update the existing ones
        for qh in my_qh_web_list:
            if not qh['code'] in qh_local_list:
                qh_obj.create(cr, uid, {
                    'now_price': qh['now_price'],
                    'name': qh['name'],
                    'old_price': qh['old_price'],
                    'sum_price': qh['sum_price'],
                    'num': qh['num'],
                    'date': qh['date'],
                    'amount': qh['amount'],
                    'amount_str': qh['amount_str'],
                    'trend': qh['trend'],
                    'code': qh['code']}, context=context)
            else:
                write_ids = qh_obj.search(cr, uid, [('code', '=', qh['code'])])
                qh_obj.write(cr, uid, write_ids, {
                    'now_price': qh['now_price'],
                    'name': qh['name'],
                    'old_price': qh['old_price'],
                    'sum_price': qh['sum_price'],
                    'num': qh['num'],
                    'date': qh['date'],
                    'amount': qh['amount'],
                    'amount_str': qh['amount_str'],
                    'trend': qh['trend'],
                    'code': qh['code']}, context=context)

        # Remove local records for futures that have already been sold
        for qh in qh_local_list:
            i = 0
            for web_qh in my_qh_web_list:
                if qh == web_qh['code']:
                    i += 1
            if i == 0:
                print '------>', qh, 'has been sold, deleting...'
                ids = qh_obj.search(cr, uid, [('code', '=', qh)])
                qh_obj.unlink(cr, uid, ids, context=context)
                print '------> deletion done'
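# Hedged usage sketch (OpenERP 7-style call; the model name 'cwz.qihuo.myqh' is
# an assumption, since the class that owns get_my_qh is not shown in this excerpt):
# self.pool.get('cwz.qihuo.myqh').get_my_qh(cr, uid, context=context)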