Пример #1
0
def artverify(art, html='', pdf=''):
    """
    Check whether HTML and PDF documents contain the abstract text.

    Arguments:
        art: Article (or a value coercible to one via toart)
        html (str): HTML text (optional; loaded from art if empty)
        pdf (str): PDF text (optional; loaded from art if empty)

    Returns:
        (htmlprop, pdfprop): proportion of abstract words found in each
        document; an entry is None when that document -- or the
        abstract itself -- is unavailable.
    """

    # Cast article to Article
    art = toart(art)

    # Get article info
    info = artinfo({'xml' : art.xml})

    # Quit if no abstract
    if info['abstxt'] is None:
        return None, None

    # Tokenize abstract: lower-case each word and strip surrounding
    # punctuation in one pass.  (The previous per-character strip loop
    # missed mixed trailing punctuation such as "end.,".)
    abswords = [word.lower().strip('.,;:')
                for word in re.split(r'\s+', info['abstxt'])]

    # Drop empty tokens: ''.find('') is 0, so empty strings would count
    # as "found" and inflate the match proportions.
    abswords = [word for word in abswords if word]

    # Guard against an abstract that is only whitespace/punctuation
    # (would otherwise divide by zero below)
    if not abswords:
        return None, None

    # Load HTML
    if not html:
        html = loadhtml(art, overwrite=True)

    # Load PDF
    if not pdf:
        pdf = loadpdf(art)
        pdf = to_unicode(pdf)

    # To lower-case
    html = html.lower()
    pdf = pdf.lower()

    # Proportion of abstract words appearing verbatim in the HTML
    if html:
        htmlwords = [word for word in abswords if html.find(word) > -1]
        htmlprop = float(len(htmlwords)) / len(abswords)
    else:
        htmlprop = None

    # Proportion of abstract words appearing verbatim in the PDF
    if pdf:
        pdfwords = [word for word in abswords if pdf.find(word) > -1]
        pdfprop = float(len(pdfwords)) / len(abswords)
    else:
        pdfprop = None

    # Return
    return htmlprop, pdfprop
Пример #2
0
def is404Error(html, debug=False):
    """Return True when *html* looks like an HTTP error page.

    The first <title> tag is checked for error markers; if none match,
    the whole document body is checked.  A page with no parseable
    <title> is treated as an error page.  With ``debug=True`` the
    marker that triggered the match is printed.
    """
    # Markers that flag an error page when seen in the <title> text.
    # Duplicated entries from the original list were removed; "404"
    # subsumes the other 404 variants, which are kept only so the
    # debug output can show a more specific marker.
    match404Title = [
        "404", "error", "not found", "Moved Temporarily", "401 Unauthorized",
        "403 Forbidden", "Request Timeout", "Too Many Requests",
        "Service Unavailable", "404 ", " 404", "404 not found",
        "page not found", "404<", ">404",
    ]
    # re.finditer/search never return None for "no match"; use search
    # and test the single (first) match directly -- the original loop
    # broke after the first title anyway.
    match = re.search(r"<title(.*)</title>", html, re.DOTALL)
    if match is None:
        return True
    title = match.group(1)
    # The capture starts right after "<title"; drop the character that
    # closes the opening tag (usually '>').
    if title:
        title = title[1:]
    title = title.lower()
    for marker in match404Title:
        if marker.lower() in title:
            if debug:
                print(">>>>> " + marker)
            return True
    # Markers that flag an error page when seen anywhere in the body:
    match404Body = [
        "404 not found", "page not found", "404<", ">404", "Moved Temporarily",
        "401 Unauthorized", "403 Forbidden", "Request Timeout",
        "Too Many Requests", "Service Unavailable"
    ]
    htmlLower = html.lower()
    for marker in match404Body:
        if marker.lower() in htmlLower:
            if debug:
                print(">>>>> " + marker)
            return True
    # No marker matched anywhere: looks like a normal page.
    return False
Пример #3
0
def getTitle(url):
  response = urllib.urlopen(url)
  html = response.read()
  html = html.replace(r'\"', '"')
  soup = BeautifulSoup(html.lower())
  urlTitle = soup.find('title')
  try:
    urlTitleText = urlTitle.text
  except:
    try:
      t = lxml.html.parse(url)
      urlTitleText = t.find(".//title").text
    except:
      print "title not found"
      print url
      urlTitleText = ""
  
  return urlTitleText.lower()
Пример #4
0
def prep_for_search(html):
    """Normalise *html* for indexing: strip tags, lower-case, un-escape."""
    stripped = strip_tags_django(html)
    unescaped = xhtml_unescape_tornado(stripped.lower())
    # Cap the searchable text at 100k characters.
    return unescaped[:100000]
Пример #5
0
def traverseURLSet():
  """Crawl every URL in the global urlDict and build a link matrix.

  Returns a square numpy matrix where matrix[row][col] == 1 means the
  page with ID `row` links to the page with ID `col` (IDs come from
  the global urlUrlIDPair mapping).  Anchor text (and <img alt> text)
  of each recorded link is attached to the *target* page object.
  """
  # Square adjacency matrix, one row/column per tracked URL.
  matrix = numpy.zeros(shape=(len(urlDict),len(urlDict)))
  for urlID in urlDict:
    l = urlDict[urlID]
    # Fetch the page and undo escaped quotes before parsing.
    response = urllib.urlopen(l.url)
    html = response.read()
    html = html.replace(r'\"', '"')
    soup = BeautifulSoup(html.lower())
    alinks = soup.findAll('a')
    
    if alinks:
      for alink in alinks:
        
        # Anchors without an href attribute are treated as empty links.
        try:
          hrefFound = alink['href']
        except:
          hrefFound = ""
          
        # Skip mailto:-style links.
        if(re.match("mail",hrefFound)):
          continue
        
        # Drop any fragment identifier from the raw href.
        if(re.search("#",hrefFound)):
          hrefFound = hrefFound.split("#")[0]
        
        # Normalise directory-style links to end with a trailing slash
        # (leaves obvious file links -- .html/.htm/.css -- untouched).
        if hrefFound.rstrip("/") == hrefFound:
          if hrefFound != "" and not re.search("html$",hrefFound) and not re.search("htm$",hrefFound) and not re.search("css$",hrefFound):
            hrefFound =  hrefFound + "/"
      
        # Resolve the href against the current page's URL.
        urlFound = urljoin(l.url, hrefFound)
       
        # A fragment may survive the join; strip it again.
        if(re.search("#",urlFound)):
          urlFound = urlFound.split("#")[0]
        
        # Only record links that point at URLs we are tracking.
        if urlFound in urlUrlIDPair:
          print alink, urlFound
          row = urlUrlIDPair[l.url]
          col = urlUrlIDPair[urlFound]
          matrix[row][col]=1
          
          try:
            alinkText = alink.text
          except:
            alinkText = ""
              
          # NOTE(review): this rebinds the outer loop variable urlID.
          # Harmless for dict iteration in CPython, but confusing --
          # consider renaming.
          urlID = urlUrlIDPair[urlFound]
          l1 = urlDict[urlID]
          
          print l.url
          print urlFound
          print alinkText
          
          # Attach the anchor text to the *target* page.
          l1.addAnchorText(alinkText)
          
          # Also attach the alt text of an embedded <img>, if any.
          try:
            alinksoup = BeautifulSoup(str(alink))
            img = alinksoup.find('img')
            alinkText = img['alt']
          except:
            alinkText = ""
          
          l1.addAnchorText(alinkText)
        
    else:
      print "No links found in", l.url
    
  return matrix
    def get_my_qh(self, cr, uid, context=None):
        """
            Fetch my futures holdings from lexun.com and sync them into
            the local ``qh.myself`` model.

            Presumably an OpenERP/Odoo-style model method: ``cr`` is the
            DB cursor, ``uid`` the acting user id -- TODO confirm.
        """

        _logger.info("-------------------->开始查询我持有的期货")

        # Read the 'lxt' session token of the first stored futures user.
        my_user_obj = self.pool.get("cwz.qihuo.user")
        user_ids = my_user_obj.search(cr, uid, [])
        if user_ids and len(user_ids) > 0:
            user_list = my_user_obj.read(cr, uid, user_ids, ['lxt'], context=context)

            # _logger.info("-------------------->我的lxt:" + str((user_list[0])['lxt']))

            # Random cache-buster appended as the _r query parameter.
            num = random.randrange(100000002, 936619604)
            url = 'http://g.lexun.com/qh/myqh.php?cd=0&lxt=' + str((user_list[0])['lxt']) + '&_r=' + str(num) + '&vs=1'
            html = urllib2.urlopen(url).read()

            page = etree.HTML(html.lower().decode('utf-8'))
            my_qh_list = page.xpath(u"//div")

            # Futures holdings parsed from the web page
            my_qh_web_list = []

            # Each <div> containing the marker text is one holding; parse
            # code / name / quantity / prices / purchase date out of it
            # with regexes against the raw HTML of that div.
            for i, x in enumerate(my_qh_list):
                qh_line = lxml.html.tostring(x, pretty_print=True, encoding='utf-8')
                if "近期走势" in qh_line and i != 0:

                    qh_line_code = qh_line
                    qh_code = re.compile('''detail.php\?typeid=(.*?)&amp;cd=0''').findall(qh_line_code)[0]
                    qh_line_name = qh_line
                    qh_line_name = qh_line_name.replace("\n", "")
                    qh_name = re.compile('''z_banner02">(.*?):''').findall(qh_line_name)[0]
                    # The name is prefixed with "N." -- keep only the part
                    # after the first dot.
                    dian = qh_name.find(".")
                    qh_name = qh_name[(dian + 1):]
                    qh_line_num = qh_line
                    qh_line_num = qh_line_num.replace("\n", "")
                    qh_num = re.compile('''共持有:(.*?)股''').findall(qh_line_num)[0]
                    qh_now_price = re.compile('''当前价:(.*?)乐币''').findall(qh_line_num)[0]
                    qh_old_price = re.compile('''成本价:(.*?)乐币''').findall(qh_line_num)[0]
                    qh_sum_price = re.compile('''总成本:(.*?)乐币''').findall(qh_line_num)[0]
                    qh_date = re.compile('''购入时间:(.*?)<br>''').findall(qh_line_num)[0]

                    # Percentage change: colored markup when non-zero,
                    # plain "name:x%" otherwise.
                    qh_amount = ''
                    if 'color' in qh_line_num:
                        qh_amount = re.compile('''color:.*">(.*)%''').findall(qh_line_num)[0]
                    else:
                        qh_amount = re.compile(qh_name + ''':(.*)%''').findall(qh_line_num)[0]
                        qh_amount = str(float(qh_amount))

                    # Trend arrow derived from the sign of the change.
                    trend_str = ''

                    if float(qh_amount) > 0:
                        trend_str = '↑'
                    elif float(qh_amount) == 0:
                        trend_str = '→'
                    else:
                        trend_str = '↓'

                    my_qh_web_list.append({
                        'code': qh_code,
                        'name': qh_name,
                        'now_price': CharactersUtil.chinese_to_num(qh_now_price),
                        'old_price': CharactersUtil.chinese_to_num(qh_old_price),
                        'sum_price': CharactersUtil.chinese_to_num(qh_sum_price),
                        'num': CharactersUtil.chinese_to_num(qh_num),
                        'date': CharactersUtil.to_utc_time(qh_date),
                        'amount': qh_amount,
                        'amount_str': qh_amount + "%",
                        'trend': trend_str
                    })

                    # Query the locally stored futures.
                    # NOTE(review): this whole sync block runs once per
                    # matching div, re-reading and re-writing the local
                    # records each iteration -- confirm whether it was
                    # meant to run once after the loop.
                    qh_obj = self.pool.get('qh.myself')
                    ids = qh_obj.search(cr, uid, [])
                    res = qh_obj.read(cr, uid, ids, ['name', 'code', 'id'], context)
                    res = [(r['name'], r['code'], r['id']) for r in res]

                    qh_local_list = []
                    for qh in res:
                        qh_local_list.append(qh[1])

                    # Create local records for futures not yet stored,
                    # update the ones that already exist.
                    for qh in my_qh_web_list:
                        if not qh['code'] in qh_local_list:
                            qh_obj.create(cr, uid, {
                                'now_price': qh['now_price'],
                                'name': qh['name'],
                                'old_price': qh['old_price'],
                                'sum_price': qh['sum_price'],
                                'num': qh['num'],
                                'date': qh['date'],
                                'amount': qh['amount'],
                                'amount_str': qh['amount_str'],
                                'trend': qh['trend'],
                                'code': qh['code']}, context=context)
                        else:
                            write_ids = qh_obj.search(cr, uid, [('code', '=', qh['code'])])
                            qh_obj.write(cr, uid, write_ids, {
                                'now_price': qh['now_price'],
                                'name': qh['name'],
                                'old_price': qh['old_price'],
                                'sum_price': qh['sum_price'],
                                'num': qh['num'],
                                'date': qh['date'],
                                'amount': qh['amount'],
                                'amount_str': qh['amount_str'],
                                'trend': qh['trend'],
                                'code': qh['code']}, context=context)

                    # Delete local records for futures that have been sold
                    # NOTE(review): `i` is incremented *before* the
                    # `i == 0` test, so the unlink branch below can never
                    # execute -- sold holdings are never deleted.  Likely
                    # a bug (the test probably belongs after the inner
                    # loop, checking for no match) -- confirm intent.
                    for qh in qh_local_list:
                        i = 0
                        for web_qh in my_qh_web_list:
                            if qh == web_qh['code']:
                                i += 1
                                if i == 0:
                                    print '------>', qh, '已经卖出了!删除ing..'
                                    ids = qh_obj.search(cr, uid, [('code', '=', qh)])
                                    qh_obj.unlink(cr, uid, ids, context=context)
                                    print '------>删除完成'
Пример #7
0
def prep_for_search(html):
    """Prepare raw *html* for full-text search (tags stripped, lower-cased)."""
    # Strip markup, normalise case, decode entities, then truncate to 100k.
    text = xhtml_unescape_tornado(strip_tags_django(html).lower())
    return text[:100000]