Example #1
def scrape(crno):
    crnostr = "%07d" % crno
    baseurl = "https://www.mobile-cr.gov.hk/mob/cps_criteria.do?queryCRNO="
    url = baseurl + crnostr

    print "trying local", crnostr
    html = load_local(url)
    if html is None:
        print "trying site", crnostr
        html = scraperwiki.scrape(url).decode('utf-8')
        print "storing local", crnostr
        store_local(url, html.encode('utf-8'))
    else:
        html = html.decode('utf-8')

    if '沒有紀錄與輸入的查詢資料相符' in html.encode('utf-8'):
        print 'NO MATCHING RECORD FOUND FOR THE SEARCH INFORMATION INPUT!'
        return None
    root = lxml.html.fromstring(html) # , encoding="utf-8")
    tds = root.cssselect("tr td tr td")
    namestds = root.cssselect("td.data")   

    while tds == []:
        print "trying", crnostr, "again"
        sleep(46)
        html = scraperwiki.scrape(baseurl + crnostr).decode('utf-8')
        root = lxml.html.fromstring(html) # , encoding="utf-8")
        tds = root.cssselect("tr td tr td")
        namestds = root.cssselect("td.data")   

        #for idx, val in enumerate(tds):
        #    print idx, ":", val.text_content().encode('utf-8')
    names = {}
    for nameidx, nameval in enumerate(namestds):
        names["Name" + str(nameidx)] = nameval.text_content()[10:]
        names["Name" + str(nameidx) + "date"] = nameval.text_content()[:10]

    print "got", tds[1].text_content() 

    data = {
        'cr' : tds[1].text_content(),
        'English Company Name' : tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name' : tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type' : tds[4].text_content()[:-1],
        'Date of incorporation' : tds[6].text_content(),
        # 'Company status' : tds[8].text_content()[:-1],
        'Active status' : tds[8].text_content()[:-1],
        'Remarks' : tds[9].text_content().replace(u"備註:",""),
        'Winding up mode' : tds[11].text_content()[:-1],
        'Date of Dissolution' : tds[13].text_content(),
        'Register of Charges' : tds[15].text_content()[:-1],
        'Important Note' : tds[16].text_content().replace(u"重要事項:","").lstrip('\r\n\t')
    }
    data.update(names)
    
    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()
Example #2
def main():
    import optparse

    optparser = optparse.OptionParser(
        description="Transforms Hansard XML from the Canadian House of Commons into "
        "an easy-to-process HTML format. If no options are specified, reads XML from stdin."
    )
    optparser.add_option("-f", "--file", dest="filename", help="Process the XML file at FILE")
    optparser.add_option(
        "-i",
        "--docid",
        dest="docid",
        help="Document ID (e.g. 5069607) on parl.gc.ca; it'll be fetched and processed",
        metavar="ID",
    )
    optparser.add_option(
        "-l",
        "--language",
        dest="language",
        metavar="[E,F]",
        default="E",
        help="Language of the document to download. Only necessary if alpheus is downloading from parl.gc.ca.",
    )

    group = optparse.OptionGroup(optparser, "Debugging Options")
    group.add_option(
        "--print-names",
        dest="print_names",
        action="store_true",
        help="Instead of outputting HTML, print a list of names of people speaking.",
    )
    group.add_option("--pdb", dest="pdb", action="store_true", help="Drop into the Python debugger on exception")
    optparser.add_option_group(group)

    (options, args) = optparser.parse_args()
    try:
        if options.filename:
            document = parse_file(open(options.filename))
        elif options.docid:
            document = fetch_and_parse(options.docid, options.language[0].upper())
        else:
            document = parse_file(sys.stdin)
    except Exception as e:
        if options.pdb:
            import pdb

            pdb.post_mortem()
        else:
            raise
    # sys.stderr.write("Parsed %d statements\n" % len(document.statements))
    if options.print_names:
        for s in document.statements:
            print s.meta.get("person_attribution", "").encode("utf8")
    else:
        html = document.as_html()
        print html.encode("utf8")
Example #3
    def render_GET(self, request):
        try:
            style = get_style_by_name(self.style_name)
        except ClassNotFound:
            style = get_style_by_name('default')
            self.style_name = 'default'

        prev_url = None
        if self.days_back:
            prev_url = self.url_for(request, self.days_back - 1)
        next_url = self.url_for(request, (self.days_back or 0) + 1)
        formatter = LogFormatter(style=style)

        if self.days_back:
            log_date = date.today() - timedelta(self.days_back)
            suffix = log_date.strftime('.%Y_%m_%d').replace('_0', '_')
            self.logfilename += suffix
        try:
            with codecs.open(self.logfilename, 'r', 'utf-8') as logfile:
                html = self.render_log(logfile.read(), formatter,
                                       prev_url, next_url)
        except IOError:
            request.setResponseCode(404)
            return '<html><body>Go away.</body></html>'
        request.setHeader('Content-Type', 'text/html;charset=utf-8')
        return html.encode('utf-8')
Example #4
def convert_html_to_markdown(html):
    # type: (Text) -> Text
    # On Linux, the tool installs as html2markdown, and there's a command called
    # html2text that does something totally different. On OSX, the tool installs
    # as html2text.
    commands = ["html2markdown", "html2text"]

    for command in commands:
        try:
            # A body width of 0 means do not try to wrap the text for us.
            p = subprocess.Popen(
                [command, "--body-width=0"], stdout=subprocess.PIPE,
                stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
            break
        except OSError:
            continue

    markdown = p.communicate(input=html.encode('utf-8'))[0].decode('utf-8').strip()
    # We want images to get linked and inline previewed, but html2text will turn
    # them into links of the form `![](http://foo.com/image.png)`, which is
    # ugly. Run a regex over the resulting description, turning links of the
    # form `![](http://foo.com/image.png?12345)` into
    # `[image.png](http://foo.com/image.png)`.
    return re.sub(u"!\\[\\]\\((\\S*)/(\\S*)\\?(\\S*)\\)",
                  u"[\\2](\\1/\\2)", markdown)
Example #5
def parse_usage(html):
    """
    Extract the usage integers out of the summary HTML and return them as a
    dictionary with keys 'minutes', 'texts', 'megabytes'.
    """
    with open('test.html', 'w') as f:
        f.write(html.encode('utf-8'))
    lxml_root = lxml.html.fromstring(html)
    fields = OrderedDict([
        ('minutes', ("//*[contains(text(), 'minutes used')]/"
                     "preceding-sibling::strong/text()", int)),

        ('texts', ("//*[contains(text(), 'texts used')]/"
                   "preceding-sibling::strong/text()", int)),

        ('megabytes', ("//*[contains(text(), 'MB data used')]/"
                       "preceding-sibling::strong/text()", int)),
    ])
    data = {}
    for field, (xpath, convert_function) in fields.items():
        logging.debug(xpath)
        matching_elements = lxml_root.xpath(xpath)
        assert len(matching_elements) == 1
        value = matching_elements[0]

        if convert_function:
            value = convert_function(value)
        data[field] = value
    return data
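A hedged usage sketch for parse_usage (Python 2, matching the snippet, which writes encoded bytes to a text-mode test.html as a side effect). The markup is invented to satisfy the XPath expressions: each value sits in a <strong> element directly before its label.

sample = u"""
<div><strong>123</strong><span>minutes used</span></div>
<div><strong>45</strong><span>texts used</span></div>
<div><strong>678</strong><span>MB data used</span></div>
"""
usage = parse_usage(sample)
print(usage)   # contains minutes=123, texts=45, megabytes=678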
Example #6
def parse_speeches(i,url,html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404: #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get('content') #summary from metatag
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_organisations'] = meta.xpath('//*[contains(text(), "Publisher:")]/following-sibling::*')[0].text_content() #associate organisations
    except:
        pass
    try:
        record['delivered_by'] = meta.xpath('//*[contains(text(), "Delivered by:")]/following-sibling::*')[0].text_content() #delivered by
    except:
        pass
    try:
        record['delivered_on_date'] = meta.xpath('//*[contains(text(), "Delivered date:")]/following-sibling::*')[0].text_content().strip() # delivered on date
    except:
        pass
    try:
        record['speech_type'] = meta.xpath('//*[contains(text(), "Type:")]/following-sibling::*')[0].text_content().strip() # speech type
    except:
        pass
    try:
        record['event'] = meta.xpath('//*[contains(text(), "Event:")]/following-sibling::*')[0].text_content().strip() # event
    except:
        pass
    try:
        record['location'] = meta.xpath('//*[contains(text(), "Location:")]/following-sibling::*')[0].text_content().strip() # location
    except:
        pass
    if 'event' in record and 'location' in record:
        record['event_and_location'] = record['event'] + ', ' + record['location'] # event + location 
    try:    
        record['date'] = dateutil.parser.parse(record["delivered_on_date"], dayfirst=True).date().isoformat() #iso date
    except:
        pass
    try:
        record['associated_policies'] = meta.xpath('//*[contains(text(), "Mode/topic:")]/following-sibling::*')[0].text_content() #associate policies
    except:
        pass    
    for node in content.cssselect("div.header"): #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content)) #bodytext
    record['body'] = record['body'].replace(u"\xa0", u" ") #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    return record
Example #7
    def fromstring(self, html):
        html = encodeValue(html)
        try:
            self.doc = lxml.html.fromstring(html)
        except:
            html = html.encode('utf-8', 'replace')
            self.doc = lxml.html.fromstring(html)
        return self.doc
Example #8
    def fromstring(self, html):
        html = encodeValue(html)
        try:
            self.doc = lxml.html.fromstring(html)
        except:
            html = html.encode("ascii", "replace")
            self.doc = lxml.html.fromstring(html)
        return self.doc
Example #9
def searchGoogle(driver, query):
    query_decoded=query.decode("utf-8")
    #print query_decoded
    driver.get("http://google.ru/search?" + urlencode({'q': query}))
    time.sleep(2)
    html = driver.page_source
    html=html.encode("utf-8")
    return html
Example #10
def markdownify_html2text(html):

    p = subprocess.Popen(['html2text', '-d', '-b', '0', ],
        stdin=subprocess.PIPE,
        stdout=subprocess.PIPE)
    stdout, stderr = p.communicate(input=html.encode('utf-8'))

    return stdout
Example #11
    def fromstring(cls, html, original_encoding='utf-8'):
        html = encodeValue(html, encoding=original_encoding)
        try:
            parser = lxml.html.HTMLParser(encoding=original_encoding)
            cls.doc = lxml.html.fromstring(html.encode(original_encoding), parser=parser)
        except Exception, e:
            print '[Parse lxml ERR]', str(e)
            return None
Example #12
    def clean_html(cls, html, encoding=None):
        parser = lxml.html.HTMLParser(encoding=encoding)

        if isinstance(html, unicode) and encoding is not None:
            html = html.encode(encoding)

        html = lxml.html.document_fromstring(html, parser=parser)
        return _cleaner.clean_html(html)
Example #13
def unicodeToStr(html, encoding='utf-8'):
    if not isinstance(html, unicode):
        decoding, charJust = '', chardet.detect(html)
        try: decoding = 'gbk' if charJust['encoding'].lower() == 'gb2312' else charJust['encoding']
        except Exception, e: print 'unicodeToStr chardet detect error:', Exception, '->', e
        if encoding and decoding and decoding!=encoding : html = html.decode(decoding, 'ignore').encode(encoding, 'ignore')
    else:
        if encoding: html = html.encode(encoding, 'ignore')
    return html
Example #14
    def tidy (html):
        """ Pipe html thru w3c tidy. """

        html = parsers.RE_RESTRICTED.sub ('', html)
        html = RE_XMLDECL.sub ('', html)
        html = parsers.RE_HTML_CHARSET.sub ('; charset=utf-8', html)

        # convert to xhtml
        tidy = subprocess.Popen (
            ["tidy",
             "-utf8",
             "-clean",
             "--wrap",             "0",
             # "--drop-font-tags",   "y",
             # "--drop-proprietary-attributes", "y",
             # "--add-xml-space",    "y",
             "--output-xhtml",     "y",
             "--numeric-entities", "y",
             "--merge-divs",       "n", # keep poetry indentation
             "--merge-spans",      "n",
             "--add-xml-decl",     "n",
             "--doctype",          "strict",
             "--anchor-as-name",   "n",
             "--enclose-text",     "y" ],

            stdin = subprocess.PIPE,
            stdout = subprocess.PIPE,
            stderr = subprocess.PIPE)

        # print (html.encode ('utf-8'))
        # sys.exit ()

        (html, stderr) = tidy.communicate (html.encode ('utf-8'))

        regex = re.compile ('(Info:|Warning:|Error:)\s*', re.I)

        # pylint: disable=E1103
        msg = stderr.rstrip ()
        for line in msg.splitlines ():
            match = regex.search (line)
            if match:
                sline = regex.sub ("", line)
                g = match.group (1).lower ()
                if g == 'info:':
                    info ("tidy: %s" % sline)
                elif g == 'warning:':
                    warn ("tidy: %s" % sline)
                elif g == 'error:':
                    error ("tidy: %s" % sline)
                else:
                    error (line)

        if tidy.returncode == 2:
            raise ValueError, stderr

        return html.decode ('utf-8')
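The core pattern above, feeding UTF-8 bytes to an external filter over stdin and reading stdout/stderr back, can be isolated into a small helper. A sketch (Python 3; the command and flags are taken from the snippet):

import subprocess

def pipe_through(cmd, text):
    # send encoded text to the command's stdin, collect stdout and stderr
    proc = subprocess.Popen(cmd, stdin=subprocess.PIPE,
                            stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    out, err = proc.communicate(text.encode('utf-8'))
    return out.decode('utf-8'), err.decode('utf-8'), proc.returncode

# e.g. xhtml, messages, status = pipe_through(
#     ["tidy", "-utf8", "-clean", "--output-xhtml", "y"], html)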
Example #15
def cache_results(search_params, html):
    """Stores a html resource as a file in scrapecache/fname.cache

    This will always write(overwrite) the cache file.
    """
    fname = cached_file_name(search_params)

    with open(os.path.join(CACHEDIR, fname), 'w') as fd:
        #TODO see encoding in detail.
        fd.write(html.encode('utf8'))
Example #16
    def fromstring(self, html):
        # html = normalize_spaces(html)
        html = clean_attributes(html)
        self.doc = lxml.html.fromstring(html.encode('utf-8'))
        # TODO: check which parser gives higher accuracy
        # from lxml.html import html5parser
        # import lxml.html
        # html5doc = html5parser.document_fromstring(html.encode('utf-8'))
        # self.doc = lxml.html.fromstring(self.nodeToString(html5doc))
        return self.doc
Example #17
def send_email(from_email, to_email_list, subject, html, smtp_host, smtp_port=587, username=None, password=None):
    message = Message(From=from_email, To=to_email_list, charset='utf-8')
    # Keep from creating threads in gmail...
    message.Subject = "{} -- {}".format(subject, datetime.now().strftime('%Y-%m-%dT%H:%M:%S'))
    message.Html = html.encode('utf-8')
    message.Body = 'See the HTML!'

    sender = Mailer(host=smtp_host, port=smtp_port, use_tls=True, usr=username, pwd=password)
    if username is not None:
        sender.login(username, password)
    sender.send(message)
Example #18
def parse_news(i,url,html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404: #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get('content') #summary from metatag
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip() #stripped title
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_organisations'] = meta.xpath('//*[contains(text(), "Publisher:")]/following-sibling::*')[0].text_content() #associate organisations
    except:
        pass
    try:
        record['associated_policies'] = meta.xpath('//*[contains(text(), "Mode/topic:")]/following-sibling::*')[0].text_content() #associate policies
    except:
        pass
    try:
        record['first_published'] = meta.xpath('//*[contains(text(), "Published date:")]/following-sibling::*')[0].text_content().strip() # first published date
    except:
        pass
    try:
        record['type'] = meta.xpath('//*[contains(text(), "Type:")]/following-sibling::*')[0].text_content().strip() # type
    except:
        pass
    try:    
        record['date'] = dateutil.parser.parse(record["first_published"], dayfirst=True).date().isoformat() #iso date
    except:
        pass
    
    for node in content.cssselect("div.header"): #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    #record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content,encoding=unicode)) #bodytext
    
    #record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content,encoding="ascii")) #bodytext

    #encoding mess:
    record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content)) #bodytext
    record['body'] = record['body'].replace(u"\xa0", u" ") #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    #print lxml.html.tostring(content,encoding="ascii")
    #print repr(record['body'])
    
    return record
Example #19
def doConvert(url):
    # load the page
    j = urllib.urlopen(url)
    try:
        from feedparser import _getCharacterEncoding as enc
    except ImportError:
        enc = lambda x, y: ('utf-8', 1)
    text = j.read()
    encoding = enc(j.headers, text)[0]
    if encoding == 'us-ascii': encoding = 'cp1251'
    data = text.decode(encoding)
    # convert the HTML document to markdown
    originalMarkdownDocument = html2text.html2text(data, url)
    markdownDocument = originalMarkdownDocument.split("\n")
    # find the upper boundary (start) of the article
    title = lxml.html.document_fromstring(text)
    startLine = findStartMerker(title.find(".//title").text, markdownDocument)
    # delete the text above the upper boundary
    del markdownDocument[:startLine]
    # find the lower boundary (end) of the article
    skiplist = []
    endLine = findEndMarker(markdownDocument, skiplist)
    # remove the lines collected in skiplist
    for x in range(len(skiplist)-1,0, -1):
        markdownDocument.pop(skiplist[x])
    # cut the article off at the lower boundary
    if endLine != -1:
        del markdownDocument[endLine-len(skiplist)+1:]
    else:
        return
    # replace references with inline links
    fragment = listToString(markdownDocument)
    fragment = replaceInternalLinks(originalMarkdownDocument, fragment)
    global htmlOut
    if htmlOut == 1:
        # convert markdown back to HTML
        html = markdown.markdown(fragment)
        print html.encode('utf-8')
    else:
        print fragment.encode('utf-8')
Example #20
def process_article(html, full=True, replace=False):
    pos = 0
    src = None
    try:
        soup = BeautifulSoup(html)
    except UnicodeEncodeError:
        soup = BeautifulSoup(html.encode('utf-8', 'ignore'))
    media_found = False
    for tag in soup.find_all(True):
        if any(x == tag.name for x in EXCLUDED_TAGS) \
            or (tag.name == 'div' and 'class' in tag.attrs and any(div in tag.attrs['class'] for div in EXCLUDED_DIV_CLASS))\
            or ((not tag.contents and not tag.name == 'img' and (tag.string is None or not tag.string.strip()))
                or (tag.name == 'img' and 'src' in tag.attrs
                    and any(host in tag['src'] for host in EXCLUDED_IMAGES_HOST)))\
            or (tag.name == 'a' and 'href' in tag.attrs and any(host in tag.attrs['href'] for host in EXCLUDED_A))\
                or isinstance(tag, Comment):
                    if tag.parent and tag.parent.name == 'a':
                        tag.parent.decompose()
                    else:
                        tag.decompose()
                    continue
        for attr in EXCLUDED_ATTR:
            try:
                del tag[attr]
            except AttributeError:
                pass
        if not replace and not media_found and full:
            if tag.name != 'img' and tag.name != 'a' and pos > 12:
                media_found = True
            elif tag.name == 'img' and 'src' in tag.attrs:
                src = tag.attrs['src']
                if src:
                    o = urlparse.urlparse(src)
                    src = o.scheme + "://" + o.netloc + o.path
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
                media_found = True
            pos += 1
        if replace:
            if tag.name == 'img' and 'src' in tag.attrs and tag.attrs['src'] == replace:
                if tag.parent and tag.parent.name == 'a':
                    tag.parent.decompose()
                else:
                    tag.decompose()
    content = unicode(soup)
    if full:
        excerpt = (strip_tags(content)).strip()
        return {'content': content, 'image': src, 'word_count': len(excerpt.split()), 'excerpt': excerpt}
    else:
        return {'content': content, 'image': src}
Example #21
    def __bytes__(self):
        # try to make sure summary is wrapped in a tag
        summary = self.document.metadata['summary']
        try:
            etree.fromstring(summary)
            html = '{}'.format(summary)
        except etree.XMLSyntaxError:
            html = """\
<div class="description" data-type="description"\
 xmlns="http://www.w3.org/1999/xhtml">
  {}
</div>""".format(summary)
        return html.encode('utf-8')
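A quick illustration of the well-formedness check above (assuming `etree` is lxml's etree, as the XMLSyntaxError suggests): plain text is not parseable XML, so it takes the wrapping branch, while markup that already parses is used as-is.

from lxml import etree

try:
    etree.fromstring('A plain-text summary')      # no markup at all
except etree.XMLSyntaxError:
    print('not well-formed XML -> wrap it in the description <div>')

etree.fromstring('<p>An already-wrapped summary</p>')  # parses, used as-is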
Example #22
    def extract(self, html, link):
        (title, body) = readability_extract(html)

        document = lxml.html.fromstring(html.encode('utf-8'))
        date_cells = document.cssselect('td.createdate')
        date = date_cells[0].text_content().strip() if len(date_cells) == 1 else None
        doc = {
            'url': link,
            'title': title,
            'text': body,
            'date': parse(date),
            'source': 'ACGA News & Views'
        }
        return doc
Example #23
def load_html(tree_or_html, base_url=None):
    """
    Parse HTML data to a lxml tree.
    ``tree_or_html`` must be either unicode or utf8-encoded
    (even if original page declares a different encoding).

    If ``tree_or_html`` is not a string then it is returned as-is.
    """
    if not isinstance(tree_or_html, (six.string_types, bytes)):
        return tree_or_html

    html = tree_or_html
    if isinstance(html, six.text_type):
        html = html.encode('utf8')
    return lxml.html.fromstring(html, base_url=base_url, parser=parser)
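A hedged usage sketch for load_html; the module-level `parser` it passes to lxml is assumed to be configured elsewhere.

tree = load_html(u'<p>héllo</p>', base_url='http://example.com/')
print(tree.tag)                   # 'p'
assert load_html(tree) is tree    # non-string input is returned unchanged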
Example #24
def Consultations():
    #scraperwiki.sqlite.execute("drop table if exists consultations")
    scraperwiki.sqlite.execute("create table if not exists consultations (old_url text, i integer)")
    hurlbatch = scraperwiki.sqlite.execute("select xllinks.i, xllinks.url, html, htmlcache.status from xllinks left join htmlcache on htmlcache.url=xllinks.url left join consultations on consultations.i=xllinks.i where xllinks.sheetname='Consultations' and consultations.old_url is null and htmlcache.url is not null limit 20")
    ldata = [ ]
    print "fetched batch", len(hurlbatch["data"])
    for i, url, html, response_status in hurlbatch["data"]:        
        data = parse_consultations(i, url, html.encode('latin-1'), response_status)
        if data:
            ldata.append(data)
            print data
        else:
            print "Failed to parse", url, html
    scraperwiki.sqlite.save(["i"], ldata, "consultations")
    return len(ldata)
Example #25
    def parse(self, html=None):
        """Public function to start parsing the search engine results.

        Args:
            html: The raw html data to extract the SERP entries from.
        """
        if html:
            self.html = html.encode('utf-8').decode('utf-8')

        # lets do the actual parsing
        self._parse()

        # Apply subclass specific behaviour after parsing has happened
        # This is needed because different parsers need to clean/modify
        # the parsed data uniquely.
        self.after_parsing()
Example #26
def getData(html, course):
    root = lxml.html.fromstring(html.encode('utf-8'))

    sect = root.find_class('section main')

    # remove the "Allgemeines" (general) section
    sect.pop(0)

    sections = []

    # all sections
    for sec in sect:
        date = sec[0].text_content()
        kw = parseDateToKW(date)

        assignments = []
        scripts = []
        others = []

        for docs in sec.find_class('activityinstance'):

            # special case: skip entries whose link cannot be accessed
            if (not (docs.xpath("a"))):
                continue

            link = (docs.xpath("a"))[0].get('href')
            name_of_file = docs.find_class('instancename')[0].text

            nof = removeUmlaut(name_of_file)

            if re.match(course['pattern_script'], name_of_file):
                scripts.append(((nof, link)))

            elif re.match(course['pattern_assignment'], name_of_file):
                assignments.append((nof, link))

            # all other links, including splash pages
            else:
                others.append((nof, link))

        if (not assignments) and (not scripts) and (not others):
            continue
        else:
            sections.append(
                Processed_Section(date, kw, assignments, scripts, others))

    return sections
Example #27
def html_to_dom(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """Converts HTML to DOM."""
    if isinstance(html, unicode):
        decoded_html = html
        # encode HTML for case it's XML with encoding declaration
        forced_encoding = encoding if encoding else default_encoding
        html = html.encode(forced_encoding, errors)
    else:
        decoded_html = decode_html(html, default_encoding, encoding, errors)

    try:
        dom = lxml.html.fromstring(decoded_html, parser=lxml.html.HTMLParser())
    except ValueError:
        # Unicode strings with encoding declaration are not supported.
        # for XHTML files with encoding declaration, use the declared encoding
        dom = lxml.html.fromstring(html, parser=lxml.html.HTMLParser())

    return dom
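The fallback above exists because lxml refuses unicode input that carries an XML encoding declaration; a minimal illustration:

import lxml.html

decl = u'<?xml version="1.0" encoding="utf-8"?><html><body><p>hi</p></body></html>'
try:
    lxml.html.fromstring(decl)                        # raises ValueError
except ValueError:
    dom = lxml.html.fromstring(decl.encode('utf-8'))  # bytes are accepted
    print(dom.text_content())                         # 'hi'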
Example #28
def searchGoogle(driver, query):
    query_decoded=query.decode("utf-8")
    #print query_decoded
    driver.get("http://google.ru/search?" + urlencode({'q': query}))
    #inputElement = driver.find_element_by_id("lst-ib")
    #inputElement.send_keys(query_decoded)
    #inputElement.submit()
    time.sleep(2)
    html = driver.page_source
    html=html.encode("utf-8")
    #f = open(query + '.html', 'w')
    #f.write(html)
    #f.close()
    #check if yandex banned query then std print error and continue
#    if (("Нам очень жаль, но запросы, поступившие" in html) or ("Введите, пожалуйста, символы с картинки в поле ввода" in html)):
#        print 'Yandex banned: ' + query_decoded + '\n'
#        raise
    return html
Example #29
def html_to_dom(html, default_encoding=DEFAULT_ENCODING, encoding=None, errors=DEFAULT_ENC_ERRORS):
    """Converts HTML to DOM."""
    if isinstance(html, unicode):
        decoded_html = html
        # encode HTML for case it's XML with encoding declaration
        forced_encoding = encoding if encoding else default_encoding
        html = html.encode(forced_encoding, errors)
    else:
        decoded_html = decode_html(html, default_encoding, encoding, errors)

    try:
        dom = lxml.html.fromstring(decoded_html, parser=lxml.html.HTMLParser())
    except ValueError:
        # Unicode strings with encoding declaration are not supported.
        # for XHTML files with encoding declaration, use the declared encoding
        dom = lxml.html.fromstring(html, parser=lxml.html.HTMLParser())

    return dom
Example #30
def Consultations():
    #scraperwiki.sqlite.execute("drop table if exists consultations")
    scraperwiki.sqlite.execute(
        "create table if not exists consultations (old_url text, i integer)")
    hurlbatch = scraperwiki.sqlite.execute(
        "select xllinks.i, xllinks.url, html, htmlcache.status from xllinks left join htmlcache on htmlcache.url=xllinks.url left join consultations on consultations.i=xllinks.i where xllinks.sheetname='Consultations' and consultations.old_url is null and htmlcache.url is not null limit 20"
    )
    ldata = []
    print "fetched batch", len(hurlbatch["data"])
    for i, url, html, response_status in hurlbatch["data"]:
        data = parse_consultations(i, url, html.encode('latin-1'),
                                   response_status)
        if data:
            ldata.append(data)
            print data
        else:
            print "Failed to parse", url, html
    scraperwiki.sqlite.save(["i"], ldata, "consultations")
    return len(ldata)
Example #31
def html_to_dom(html,
                default_encoding=DEFAULT_ENCODING,
                encoding=None,
                errors=DEFAULT_ENC_ERRORS):
    """Converts HTML to DOM."""
    if isinstance(html, unicode):
        decoded_html = html
        # encode HTML for case it's XML with encoding declaration
        forced_encoding = encoding if encoding else default_encoding
        html = html.encode(forced_encoding, errors)
    else:
        decoded_html = decode_html(html, default_encoding, encoding, errors)

    try:
        dom = lxml.html.fromstring(decoded_html)
    except ValueError:
        dom = lxml.html.fromstring(html)

    return dom
Example #32
def remove_control_characters(html):
    # type: (t.Text) -> t.Text
    """
    Strip invalid XML characters that `lxml` cannot parse.
    """
    # See: https://github.com/html5lib/html5lib-python/issues/96
    #
    # The XML 1.0 spec defines the valid character range as:
    # Char ::= #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
    #
    # We can instead match the invalid characters by inverting that range into:
    # InvalidChar ::= #xb | #xc | #xFFFE | #xFFFF | [#x0-#x8] | [#xe-#x1F] | [#xD800-#xDFFF]
    #
    # Sources:
    # https://www.w3.org/TR/REC-xml/#charsets,
    # https://lsimons.wordpress.com/2011/03/17/stripping-illegal-characters-out-of-xml-in-python/
    def strip_illegal_xml_characters(s, default, base=10):
        # Compare the "invalid XML character range" numerically
        n = int(s, base)
        if (
            n in (0xB, 0xC, 0xFFFE, 0xFFFF)
            or 0x0 <= n <= 0x8
            or 0xE <= n <= 0x1F
            or 0xD800 <= n <= 0xDFFF
        ):
            return ""
        return default

    # We encode all non-ascii characters to XML char-refs, so for example "💖" becomes: "&#x1F496;"
    # Otherwise we'd remove emojis by mistake on narrow-unicode builds of Python
    html = html.encode("ascii", "xmlcharrefreplace").decode("utf-8")
    html = re.sub(
        r"&#(\d+);?", lambda c: strip_illegal_xml_characters(c.group(1), c.group(0)), html
    )
    html = re.sub(
        r"&#[xX]([0-9a-fA-F]+);?",
        lambda c: strip_illegal_xml_characters(c.group(1), c.group(0), base=16),
        html,
    )
    # A regex matching the "invalid XML character range"
    html = re.compile(r"[\x00-\x08\x0b\x0c\x0e-\x1F\uD800-\uDFFF\uFFFE\uFFFF]").sub("", html)
    return html
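A hedged usage sketch (Python 3): the vertical-tab control character is stripped, while the emoji survives as the numeric character reference produced by xmlcharrefreplace, to be resolved later by the HTML parser.

dirty = u"ok\x0bfine \U0001F496"
print(remove_control_characters(dirty))  # okfine &#128150;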
Example #33
def decompress_descriptions(encoding='utf-8'):
    """Convert parquet to tarfile"""

    pf = pq.ParquetFile(YAHOO_PARQUET)

    progress = tqdm(file=sys.stdout, disable=False)

    with tarfile.open(YAHOO_ARCH, 'w:bz2') as archive:
        for i in range(pf.metadata.num_row_groups):
            table = pf.read_row_group(i)
            columns = table.to_pydict()
            for symbol, html in zip(columns['symbol'], columns['html']):
                bytes = html.encode(encoding)
                s = io.BytesIO(bytes)
                tarinfo = tarfile.TarInfo(name=f'yahoo/{symbol}.html')
                tarinfo.size = len(bytes)
                archive.addfile(tarinfo=tarinfo, fileobj=s)
                progress.update(1)

    progress.close()
Example #34
def sync_oss():
    mongo = MongoSpider(conf.mongo_spider)
    with open(conf.data_root + os.sep + 'filter.json') as fd:
        data = fd.read()
    feeds = json.loads(data)
    feeds = dict(filter(lambda x: x[1]['status'] == 'good', feeds.iteritems()))
    for key, feed in feeds.iteritems():
        try:
            doc = {
                'rss_id': feed['_id'],
                'html': {
                    '$exists': True,
                    '$ne': ''
                },
                'oss': {
                    '$ne': 'success'
                }
            }
            success, error = 0, 0
            for article in mongo.article.find(doc, timeout=False):
                html = mongo.file.get(article['html'])
                html = html.encode('utf-8')
                if html and mongo.oss.put(article['html'], html):
                    mongo.article.update({'_id': article['_id']},
                                         {'$set': {
                                             'oss': 'success'
                                         }})
                    print article['title'], 'OK'
                    success += 1
                else:
                    mongo.article.update({'_id': article['_id']},
                                         {'$set': {
                                             'oss': 'error'
                                         }})
                    print article['title'], 'Error'
                    error += 1
            print key, success + error, success, error, feed['url']
        except KeyboardInterrupt, e:
            break
        except Exception, e:
            print e
Example #35
def add_text():
    url = request.json["url"]
    print("url", url)

    if any([y in url for y in blocklist]):
        print("blocked", [y for y in blocklist if y in url])
        return jsonify({})
    html = request.json["html"]
    html = lxml.html.tostring(lxml.html.fromstring(html.encode("utf8")))

    tree = make_tree(html, url)

    html = lxml.html.tostring(tree).decode("utf8")

    slugged_url = slug_url(url)

    t1 = time.time()
    # meta_path = BASE_PATH / "meta/v1/{}_{}.json".format(t1, slugged_url)
    # try:
    #     article = parse_article(html, url)
    #     metadata = article.to_dict(keys=ARTICLE_KEYS_TO_KEEP, skip_if_empty=True)
    # except Exception as e:
    #     metadata = {"error": str(e)}
    # metadata["creation_time"] = t1
    # metadata["slugged_url"] = slugged_url
    # with open(meta_path, "w") as f:
    #     json.dump(metadata, f, indent=4)
    # just.write(metadata, meta_path)

    html_path = BASE_PATH + "html/{}_{}.html.gz".format(t1, slugged_url)
    print("html_path", html_path)
    just.write(html, html_path)

    obj = {"path": str(html_path), "url": url, "time": str(time.time())}
    print("META_PATH", META_PATH)
    just.append(obj, META_PATH)

    last.append(html)
    last_urls.append(url)
    print("saved", url)
    return jsonify({"urls": list(last_urls)})
Example #36
def get_html_tree(html):
    """
    Given the HTML string, returns a LXML tree object. The tree is wrapped in
    <div> elements if it doesn't have a top level tag or parsing would
    otherwise result in an error. The wrapping can be later removed with
    strip_wrapping().
    """

    parser = lxml.html.HTMLParser(encoding='utf-8')
    html = html.encode('utf8')

    try:
        tree = lxml.html.fromstring(html, parser=parser)
    except lxml.etree.Error:
        # E.g. empty document. Use dummy <div>
        tree = lxml.html.fromstring('<div></div>')

    # If the document doesn't start with a top level tag, wrap it with a <div>
    # that will be later stripped out for consistent behavior.
    if tree.tag not in lxml.html.defs.top_level_tags:
        html = b'<div>%s</div>' % html
        tree = lxml.html.fromstring(html, parser=parser)

    # HACK for Outlook emails, where tags like <o:p> are rendered as <p>. We
    # can generally ignore these tags so we replace them with <span>, which
    # doesn't cause a line break. Also, we can't look up the element path of
    # tags that contain colons. When rendering the tree, we will restore the
    # tag name.
    for el in tree.iter():
        if el.nsmap or (isinstance(el.tag, string_class) and ':' in el.tag):
            if el.nsmap:
                actual_tag_name = '{}:{}'.format(
                    list(el.nsmap.keys())[0], el.tag)
            else:
                actual_tag_name = el.tag
            el.tag = 'span'
            el.attrib['__tag_name'] = actual_tag_name

    return tree
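A hedged usage sketch: a fragment without a top-level tag gets the <div> wrapper described in the docstring.

tree = get_html_tree(u'plain text with <b>markup</b>')
print(tree.tag)   # 'div' (wrapper added; removable later with strip_wrapping())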
Example #37
    def test_image_data_links_in_style(self):
        data = b'123'
        data_b64 = base64.b64encode(data).decode('ASCII')
        urls = [
            "data:image/jpeg;base64," + data_b64,
            "data:image/apng;base64," + data_b64,
            "data:image/png;base64," + data_b64,
            "data:image/gif;base64," + data_b64,
            "data:image/webp;base64," + data_b64,
            "data:image/bmp;base64," + data_b64,
            "data:image/tiff;base64," + data_b64,
            "data:image/x-icon;base64," + data_b64,
        ]
        for url in urls:
            html = '<style> url(%s) </style>' % url
            s = lxml.html.fragment_fromstring(html)

            cleaned = lxml.html.tostring(clean_html(s))
            self.assertEqual(
                html.encode("UTF-8"),
                cleaned,
                "%s  ->  %s" % (url, cleaned))
Example #38
def get_html_tree(html):
    """
    Given the HTML string, returns a LXML tree object. The tree is wrapped in
    <div> elements if it doesn't have a top level tag or parsing would
    otherwise result in an error. The wrapping can be later removed with
    strip_wrapping().
    """

    parser = lxml.html.HTMLParser(encoding='utf-8')
    html = html.encode('utf8')

    try:
        tree = lxml.html.fromstring(html, parser=parser)
    except lxml.etree.Error:
        # E.g. empty document. Use dummy <div>
        tree = lxml.html.fromstring('<div></div>')

    # If the document doesn't start with a top level tag, wrap it with a <div>
    # that will be later stripped out for consistent behavior.
    if tree.tag not in lxml.html.defs.top_level_tags:
        html = b'<div>%s</div>' % html
        tree = lxml.html.fromstring(html, parser=parser)

    # HACK for Outlook emails, where tags like <o:p> are rendered as <p>. We
    # can generally ignore these tags so we replace them with <span>, which
    # doesn't cause a line break. Also, we can't look up the element path of
    # tags that contain colons. When rendering the tree, we will restore the
    # tag name.
    for el in tree.iter():
        if el.nsmap or (isinstance(el.tag, string_class) and ':' in el.tag):
            if el.nsmap:
                actual_tag_name = '{}:{}'.format(list(el.nsmap.keys())[0], el.tag)
            else:
                actual_tag_name = el.tag
            el.tag = 'span'
            el.attrib['__tag_name'] = actual_tag_name

    return tree
Example #39
def html2dita_saxon(html, infotype='topic'):

    if not isinstance(html, six.text_type):
        raise TypeError('HTML must be str/unicode')

    html_out, errors = tidylib.tidy_document(
        html.encode('utf8'),
        options={
            'doctype': 'omit',
            'output_xhtml': 1,
            'input-encoding': 'utf8',
            'output-encoding': 'utf8',
            'char-encoding': 'utf8',
        })
    html_out = html_out.replace(b' xmlns="http://www.w3.org/1999/xhtml"', b'')

    html_filename = tempfile.mktemp(suffix='.html')
    with io.open(html_filename, 'wb') as fp:
        fp.write(html_out)

    output_filename = tempfile.mktemp(suffix='.html')
    cmd = '"{saxon}" "{html_filename}" "{h2d_xsl}" infotype={infotype} >"{output_filename}"'.format(
            saxon=saxon,
            html_filename=html_filename,
            h2d_xsl=h2d_xsl,
            infotype=infotype,
            output_filename=output_filename)

    status, output = util.runcmd(cmd)
    if status != 0:
        raise RuntimeError('html2dita() failed: {}'.format(output))

    with io.open(output_filename, 'r') as fp:
        topic_out = fp.read()

    os.unlink(html_filename)
    os.unlink(output_filename)
    return topic_out
Example #40
def extractHtml(html, selector, type='css', dump=False):
    items = []

    if html != '':
        try:
            soup = lxml.html.fromstring(html.encode('utf-8'))

            if type == 'css':
                for item in soup.cssselect(selector):
                    item = lxml.etree.tostring(item).decode('utf-8').strip()
                    items.append(item)
            elif type == 'xpath':
                result = soup.xpath(selector)
                result = result if isinstance(result, list) else [result]
                for item in result:
                    if isinstance(item, lxml.etree._Element):
                        item = lxml.etree.tostring(item).decode('utf-8')
                    items.append(str(item).strip())

        except Exception as e:
            items.append('ERROR: ' + str(e))

    return items
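A hedged usage sketch for extractHtml with both selector types; the markup is invented:

page = u'<div><p class="intro">one</p><p>two</p></div>'
print(extractHtml(page, 'p.intro'))               # ['<p class="intro">one</p>']
print(extractHtml(page, '//p/text()', 'xpath'))   # ['one', 'two']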
Example #41
def convert_html_to_markdown(html: str) -> str:
    # On Linux, the tool installs as html2markdown, and there's a command called
    # html2text that does something totally different. On OSX, the tool installs
    # as html2text.
    commands = ["html2markdown", "html2text"]

    for command in commands:
        try:
            # A body width of 0 means do not try to wrap the text for us.
            p = subprocess.Popen(
                [command, "--body-width=0"], stdout=subprocess.PIPE,
                stdin=subprocess.PIPE, stderr=subprocess.STDOUT)
            break
        except OSError:
            continue

    markdown = p.communicate(input=html.encode('utf-8'))[0].decode('utf-8').strip()
    # We want images to get linked and inline previewed, but html2text will turn
    # them into links of the form `![](http://foo.com/image.png)`, which is
    # ugly. Run a regex over the resulting description, turning links of the
    # form `![](http://foo.com/image.png?12345)` into
    # `[image.png](http://foo.com/image.png)`.
    return re.sub("!\\[\\]\\((\\S*)/(\\S*)\\?(\\S*)\\)",
                  "[\\2](\\1/\\2)", markdown)
Example #42
    def to_xhtml (self, html, base_url):
        html = html.replace (u'&nbsp;', u' ')
        html = html.replace (u'&mdash;', u'—')

        outputfilename = os.path.join (options.outputdir, options.outputfile)
        debugfilename = os.path.splitext (outputfilename)[0] + '.debug.html'

        try:
            os.remove (debugfilename)
        except OSError:
            pass
        
        if options.verbose > 1:
            with open (debugfilename, 'w') as fp:
                fp.write (html.encode ('utf-8'))

        try:
            xhtml = etree.fromstring (
                html, 
                lxml.html.XHTMLParser (),
                base_url = base_url)                                           
        except etree.ParseError, what:
            error ("etree.fromstring says %s" % what)
            raise
Example #43
variation_template = config.VAR_TEMPLATE
variation_dict = {
    "variation_table": variant_table.decode('utf-8'),
    "footer": footer.decode('utf-8'),
    "gwas_table": gwas_table.decode('utf-8'),
    "consequence_table": VEP_table.decode('utf-8'),
    "regulation": regulatory_table.decode('utf-8'),
    "Genomes_freq": population_table.decode('utf-8'),
    "ExAC_freq": exac_table.decode('utf-8'),
    "UK10K_freq": uk10K_table.decode('utf-8'),
    "genes": gene_table.decode('utf-8'),
    'GTExGenes': GTEx_genes_table.decode('utf-8'),
    "pubmed": pubmed_table.decode('utf-8'),
    "phenotypes": phenotype_table.decode('utf-8')
}

html = draw_html(variation_template, variation_dict)

# Saving file:
f = open(filename, 'w')
f.write(html.encode("utf8"))

print >> sys.stderr, "Done."

print >> sys.stderr, "Annotating genes... "
for dist in gene_list.keys():
    for gene in gene_list[dist]:
        print >> sys.stderr, "\tAnnotating %s... " % gene["ID"],
        Annotate_gene(gene["ID"])
        print >> sys.stderr, "Done."
Example #44
def parse_publications(i,url,html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404: #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get('content') #summary from metatag
    record['URN'] = root.cssselect('meta[name="DC.identifier"][scheme="ISBN"]')[0].get('content') #ISBN from metatag
    record['ISBN'] = record['URN']
    record['command_paper_number'] = record['URN']
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_policies'] = meta.xpath('//*[contains(text(), "Mode/topic:")]/following-sibling::*')[0].text_content() #associated policies
    except:
        pass
    try:
        record['associated_organisations'] = meta.xpath('//*[contains(text(), "Publisher:")]/following-sibling::*')[0].text_content() #associate organisations
    except:
        pass
    try:
        record['publication_date'] = meta.xpath('//*[contains(text(), "Published date:")]/following-sibling::*')[0].text_content() #delivered by
    except:
        pass
    try:    
        record['publication_date_iso'] = dateutil.parser.parse(record["publication_date"], dayfirst=True).date().isoformat() #iso date
    except:
        pass
    

    for node in content.cssselect("div.header"): #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    
    links = content.cssselect('a')
    if len(links) > 0: #put the page(s) with large number of attachments into a json field
        n = 1
        attachment_json = []
        for link in links:
            try:
                if 'assets.dft.gov.uk' in link.attrib['href'] or 'webarchive.nationalarchives.gov.uk' in link.attrib['href']:
                    attachment_json.append({'link': link.attrib['href'], 'title': link.text_content()})
                    n = n+1
            except:
                pass
            try:
                if 'tsoshop.co.uk' in link.attrib['href']:
                    record['order_url'] = link.attrib['href']
            except:
                pass
        record['manual'] = 1
        record['z'] = attachment_json
    else: #process the attachments
        n = 1
        for link in links:
            try:
                if 'assets.dft.gov.uk' in link.attrib['href'] or 'webarchive.nationalarchives.gov.uk' in link.attrib['href']:
                    record['attachment_'+str(n)] = link.attrib['href']
                    record['attachment_'+str(n)+'_title'] = link.text_content()
                    n = n+1
            except:
                pass
            try:
                if 'tsoshop.co.uk' in link.attrib['href']:
                    record['order_url'] = link.attrib['href']
            except:
                pass
    record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content)) #bodytext
    record['body'] = record['body'].replace(u"\xa0", u" ") #non breaking spaces
    record['body'] = record['body'].encode('utf-8')

    
    return record
Example #45
    def html_pdf_output(self):
        html = "\n".join(self.get_html_outputs())
        pdf = pipe(["wkhtmltopdf", "-", "-"],
                   html.encode('ascii', 'xmlcharrefreplace'))
        return pdf
Example #46
parsed_url = urlparse(url)


cookies = CookieJar()
useragent = 'newspaper/0.2.8'
headers = {'User-Agent': useragent}
timeout = 7
response = requests.get(url=url, **{'headers': headers, 'cookies': cookies, 'timeout': timeout, 'allow_redirects': True})
html = response.text

doc = lxml.html.fromstring(html)
clean_doc = copy.deepcopy(doc)


raw_html = html.encode('utf-8', 'replace')
link_hash = '%s.%s' % (hashlib.md5(raw_html).hexdigest(), time.time())

title = ''

tag= 'title'
selector = 'descendant-or-self::%s' % (tag or '*')

elems = clean_doc.xpath(selector, namespaces=None)

txts = [i for i in elems[0].itertext()]
TABSSPACE = re.compile(r'[\s\t]+')
value = ' '.join(txts).strip()
value = re.sub(TABSSPACE, ' ', value)
value = ''.join(value.splitlines())
Example #47
def parse_speeches(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get(
        'content')  #summary from metatag
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_organisations'] = meta.xpath(
            '//*[contains(text(), "Publisher:")]/following-sibling::*'
        )[0].text_content()  #associate organisations
    except:
        pass
    try:
        record['delivered_by'] = meta.xpath(
            '//*[contains(text(), "Delivered by:")]/following-sibling::*'
        )[0].text_content()  #delivered by
    except:
        pass
    try:
        record['delivered_on_date'] = meta.xpath(
            '//*[contains(text(), "Delivered date:")]/following-sibling::*'
        )[0].text_content().strip()  # delivered on date
    except:
        pass
    try:
        record['speech_type'] = meta.xpath(
            '//*[contains(text(), "Type:")]/following-sibling::*'
        )[0].text_content().strip()  # speech type
    except:
        pass
    try:
        record['event'] = meta.xpath(
            '//*[contains(text(), "Event:")]/following-sibling::*'
        )[0].text_content().strip()  # event
    except:
        pass
    try:
        record['location'] = meta.xpath(
            '//*[contains(text(), "Location:")]/following-sibling::*'
        )[0].text_content().strip()  # location
    except:
        pass
    if 'event' in record and 'location' in record:
        record['event_and_location'] = record['event'] + ', ' + record[
            'location']  # event + location
    try:
        record['date'] = dateutil.parser.parse(
            record["delivered_on_date"],
            dayfirst=True).date().isoformat()  #iso date
    except:
        pass
    try:
        record['associated_policies'] = meta.xpath(
            '//*[contains(text(), "Mode/topic:")]/following-sibling::*'
        )[0].text_content()  #associate policies
    except:
        pass
    for node in content.cssselect(
            "div.header"
    ):  #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    record['body'] = html2text.HTML2Text().handle(
        data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0",
                                            u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    return record
Example #48
def write_to_file(html, output_file):
    with open(output_file, "wb") as fh:
        fh.write(html.encode('utf-8'))
Example #49
def parse_news(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get(
        'content')  #summary from metatag
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()  #stripped title
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_organisations'] = meta.xpath(
            '//*[contains(text(), "Publisher:")]/following-sibling::*'
        )[0].text_content()  #associate organisations
    except:
        pass
    try:
        record['associated_policies'] = meta.xpath(
            '//*[contains(text(), "Mode/topic:")]/following-sibling::*'
        )[0].text_content()  #associate policies
    except:
        pass
    try:
        record['first_published'] = meta.xpath(
            '//*[contains(text(), "Published date:")]/following-sibling::*'
        )[0].text_content().strip()  # first published date
    except:
        pass
    try:
        record['type'] = meta.xpath(
            '//*[contains(text(), "Type:")]/following-sibling::*'
        )[0].text_content().strip()  # type
    except:
        pass
    try:
        record['date'] = dateutil.parser.parse(
            record["first_published"],
            dayfirst=True).date().isoformat()  #iso date
    except:
        pass

    for node in content.cssselect(
            "div.header"
    ):  #drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    #record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content,encoding=unicode)) #bodytext

    #record['body'] = html2text.HTML2Text().handle(data=lxml.html.tostring(content,encoding="ascii")) #bodytext

    #encoding mess:
    record['body'] = html2text.HTML2Text().handle(
        data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0",
                                            u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    #print lxml.html.tostring(content,encoding="ascii")
    #print repr(record['body'])

    return record
Example #50
    def browserview(self, html):
        tf = tempfile.NamedTemporaryFile(delete=False)
        tf.write(html.encode())
        webbrowser.open(tf.name)
Example #51
def parse_srs(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get(
        'content')  #summary from metatag
    record['type'] = 'release'
    type_img = root.xpath('//img[@alt="National Statistics logo"]')
    if type_img:
        record['type'] = 'National Stats'
    chart = root.xpath('//div[@id="line_chart"]')
    if chart:
        record['chart'] = 'yes'
    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['associated_policies'] = meta.xpath(
            '//*[contains(text(), "Statistics topic:")]/following-sibling::*'
        )[0].text_content()  #associated policies
    except:
        pass
    try:
        record['associated_organisations'] = meta.xpath(
            '//*[contains(text(), "Publisher:")]/following-sibling::*'
        )[0].text_content()  # associated organisations
    except:
        pass
    try:
        record['publication_date'] = meta.xpath(
            '//*[contains(text(), "Published date:")]/following-sibling::*'
        )[0].text_content().replace(u'\xa0', u'')
    except:
        pass
    try:
        record['publication_date_iso'] = dateutil.parser.parse(
            record["publication_date"],
            dayfirst=True).date().isoformat()  #iso date
    except:
        pass
    try:
        record['publication_series'] = meta.xpath(
            '//*[contains(text(), "Series:")]/following-sibling::*'
        )[0].text_content()  # publication series
    except:
        pass
    print record
    for node in content.cssselect("div.header"):
        # drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()

    links = content.cssselect('a')
    if len(links) > 50:  #put the page(s) with large number of attachments into a json field
        n = 1
        attachment_json = {}
        for link in links:
            try:
                if ('assets.dft.gov.uk' in link.attrib['href'] or
                        'webarchive.nationalarchives.gov.uk' in link.attrib['href']):
                    attachment_json['attachment_' + str(n)] = link.attrib['href']
                    attachment_json['attachment_' + str(n) + '_title'] = link.text_content()
                    n = n + 1
            except:
                pass
        record['manual'] = 1
        record['attachment_json'] = json.dumps(attachment_json)
    else:  #process the attachments
        n = 1
        for link in links:
            try:
                if ('assets.dft.gov.uk' in link.attrib['href'] or
                        'webarchive.nationalarchives.gov.uk' in link.attrib['href']):
                    record['attachment_' + str(n)] = link.attrib['href']
                    record['attachment_' + str(n) + '_title'] = link.text_content()
                    n = n + 1
            except:
                pass

    record['body'] = html2text.HTML2Text().handle(
        data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0",
                                            u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    print record['body']
    return record
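# Hedged usage sketch, not from the original source: parse_srs takes a row index,
# the archived URL, the raw page text and the HTTP status, and returns a record
# dict (or False when the page has no usable content div). The `pages` iterable
# and the `db` handle below are illustrative assumptions in the style of the
# other scrapers in this collection.
for i, (url, html, status) in enumerate(pages):
    record = parse_srs(i, url, html, status)
    if record:
        db['srs_pages'].upsert(record, ['old_url'])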
Exemplo n.º 52
0
def parse_sts(i, url, html, response_status):
    record = {}
    record['old_url'] = url
    record['i'] = i
    if response_status == 404:  #don't bother with the 404's
        record['status'] = 404
        return record
    root = lxml.html.fromstring(html.encode('iso-8859-1'))
    record['summary'] = root.cssselect('meta[name="DC.description"]')[0].get(
        'content')  #summary from metatag

    contentdiv = root.cssselect("div#content")
    if not contentdiv:
        return False
    content = contentdiv[0]
    if not content:
        return False
    titles = list(content.cssselect("div.hgroup h1"))
    if titles:
        record['title'] = titles[0].text_content().strip()
    meta = content.cssselect("table.meta")[0]
    try:
        record['geo_scope'] = meta.xpath(
            '//*[contains(text(), "Geographical scope:")]/following-sibling::*'
        )[0].text_content()
    except:
        pass
    try:
        record['geo_breakdown'] = meta.xpath(
            '//*[contains(text(), "Geographical breakdown:")]/following-sibling::*'
        )[0].text_content()
    except:
        pass
    try:
        record['urn'] = meta.xpath(
            '//*[contains(text(), "Reference:")]/following-sibling::*'
        )[0].text_content().replace(u'\xa0', u'')  # reference number (URN)
    except:
        pass
    try:
        record['associated_policies'] = meta.xpath(
            '//*[contains(text(), "Statistics topic:")]/following-sibling::*'
        )[0].text_content()  #associated policies
    except:
        pass
    try:
        record['associated_organisations'] = meta.xpath(
            '//*[contains(text(), "Publisher:")]/following-sibling::*'
        )[0].text_content()  # associated organisations
    except:
        pass
    try:
        record['publication_date'] = meta.xpath(
            '//*[contains(text(), "Published date:")]/following-sibling::*'
        )[0].text_content().replace(u'\xa0', u'')
    except:
        pass
    try:
        record['publication_date_iso'] = dateutil.parser.parse(
            record["publication_date"],
            dayfirst=True).date().isoformat()  #iso date
    except:
        pass
    try:
        record['publication_series'] = meta.xpath(
            '//*[contains(text(), "Series:")]/following-sibling::*'
        )[0].text_content()  # publication series
    except:
        pass
    try:
        record['type'] = meta.xpath(
            '//*[contains(text(), "Type:")]/following-sibling::*'
        )[0].text_content()  # document type
    except:
        pass
    for node in content.cssselect("div.header"):
        # drop the header info - we are done with it and don't want it in the body text
        node.drop_tree()
    for node in content.xpath('div[@id="secondary"]'):  #drop the secondary
        node.drop_tree()

    #links = content.xpath('//li/a/.')
    #linksul = content.xpath('//h2[text()="Download table"]//following-sibling::*')
    #for link in linksul:
    #    print link.tag
    links = root.xpath('//div[@id="content"]//li/a/.')

    n = 1
    for link in links:

        try:
            if ('assets.dft.gov.uk' in link.attrib['href'] or
                    'webarchive.nationalarchives.gov.uk' in link.attrib['href']):
                record['attachment_' + str(n)] = link.attrib['href']
                record['attachment_' + str(n) + '_title'] = link.text_content()
                n = n + 1
        except:
            pass

    record['body'] = html2text.HTML2Text().handle(
        data=lxml.html.tostring(content))  #bodytext
    record['body'] = record['body'].replace(u"\xa0",
                                            u" ")  #non breaking spaces
    record['body'] = record['body'].encode('utf-8')
    print record
    return record
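# Hypothetical helper sketch, not in the original code: parse_srs and parse_sts
# both number the assets.dft.gov.uk / webarchive.nationalarchives.gov.uk links as
# attachment_1, attachment_2, ...; that shared loop could be factored out like
# this. ARCHIVE_HOSTS and _collect_attachments are illustrative names.
ARCHIVE_HOSTS = ('assets.dft.gov.uk', 'webarchive.nationalarchives.gov.uk')

def _collect_attachments(links):
    attachments = {}
    n = 1
    for link in links:
        href = link.get('href')
        if href and any(host in href for host in ARCHIVE_HOSTS):
            attachments['attachment_' + str(n)] = href
            attachments['attachment_' + str(n) + '_title'] = link.text_content()
            n += 1
    return attachments

# e.g. record.update(_collect_attachments(links))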
Exemplo n.º 53
0
def scrape(crno):
    crnostr = "%07d" % crno
    baseurl = "https://www.mobile-cr.gov.hk/mob/cps_criteria.do?queryCRNO="
    url = baseurl + crnostr

    print "trying local", crnostr
    html = load_local(url)
    if html is None:
        print "trying site", crnostr
        html = scraperwiki.scrape(url).decode('utf-8')
        print "storing local", crnostr
        store_local(url, html.encode('utf-8'))
    else:
        html = html.decode('utf-8')

    if '沒有紀錄與輸入的查詢資料相符' in html.encode('utf-8'):
        print 'NO MATCHING RECORD FOUND FOR THE SEARCH INFORMATION INPUT!'
        return None
    root = lxml.html.fromstring(html)  # , encoding="utf-8")
    tds = root.cssselect("tr td tr td")
    namestds = root.cssselect("td.data")

    while tds == []:
        print "trying", crnostr, "again"
        sleep(46)
        html = scraperwiki.scrape(baseurl + crnostr).decode('utf-8')
        root = lxml.html.fromstring(html)  # , encoding="utf-8")
        tds = root.cssselect("tr td tr td")
        namestds = root.cssselect("td.data")

        #for idx, val in enumerate(tds):
        #    print idx, ":", val.text_content().encode('utf-8')
    names = {}
    for nameidx, nameval in enumerate(namestds):
        names["Name" + str(nameidx)] = nameval.text_content()[10:]
        names["Name" + str(nameidx) + "date"] = nameval.text_content()[:10]

    print "got", tds[1].text_content()

    data = {
        'cr': tds[1].text_content(),
        'English Company Name': tds[2].text_content().rsplit('\r')[1].lstrip('\n\t'),
        'Chinese Company Name': tds[2].text_content().rpartition('\r')[2].lstrip('\r\n\t'),
        'Company Type': tds[4].text_content()[:-1],
        'Date of incorporation': tds[6].text_content(),
        # 'Company status' : tds[8].text_content()[:-1],
        'Active status': tds[8].text_content()[:-1],
        'Remarks': tds[9].text_content().replace(u"備註:", ""),
        'Winding up mode': tds[11].text_content()[:-1],
        'Date of Dissolution': tds[13].text_content(),
        'Register of Charges': tds[15].text_content()[:-1],
        'Important Note': tds[16].text_content().replace(u"重要事項:", "").lstrip('\r\n\t')
    }
    data.update(names)

    db['swdata'].upsert(data, ['cr'])
    print "wrote", tds[1].text_content()
Exemplo n.º 54
0
def _cleaned_html_tree(html: str) -> HtmlElement:
    parser = HTMLParser(encoding='utf8')
    tree = fromstring(html.encode('utf8'), parser=parser)
    return _clean_html(tree)

def scrape_new_html(limit=20, url_comment_id=dict(), test_url=None):
    theta_conn = db_connections.get_theta_postgres_db()
    theta_cur = theta_conn.cursor()
    theta_cur.execute('set search_path = "backend"')
    if test_url is not None:
        theta_cur.execute(
            "select loc, html from html where loc = '{}';".format(test_url))
    else:
        theta_cur.execute("""
            SELECT
              html.loc
            , html
            FROM html
              JOIN sitemap
                ON html.loc = sitemap.loc
            WHERE (last_scrape IS NULL OR lastmod > last_scrape)
                  AND html IS NOT NULL
                  --AND NOT ('fundraisers' = ANY (categories))
                  AND NOT ('static' = ALL (categories) OR html.loc = 'https://www.crowdrise.com')
            limit {};""".format(limit))
    html_data = theta_cur.fetchall()
    if len(html_data) == 0:
        theta_cur.close()
        theta_conn.close()
        return True
    all_data = dict(fundraiser=[],
                    user=[],
                    charity=[],
                    event=[],
                    special_user=[],
                    front_page_redirect=[],
                    user_project=[],
                    charity_event=[],
                    team=[],
                    donation=[])
    scraped_urls = []
    for url, html in html_data:
        scraped_urls.append(url)
        try:
            # root = lxml.html.fromstring(lxml.html.tostring(lxml.html.fromstring(html.encode('latin1'))).decode('utf8'))
            try:
                root = lxml.html.fromstring(
                    html.encode('latin1').decode('utf8'))
            except UnicodeDecodeError:
                logging.warning(
                    'unicode decode error for url "{}"'.format(url))
                theta_conn, theta_cur = keep_theta_conn_alive(
                    theta_conn, theta_cur)
                theta_cur.execute(
                    'insert into html_bad_encoding values (%s) on CONFLICT DO NOTHING ;',
                    [(url, )])
                theta_conn.commit()
                root = lxml.html.fromstring(
                    html.encode('latin1').decode('utf8', errors='ignore'))
            try:
                page_type = CrowdriseScraper.get_page_type(root)
            except NotImplementedError:
                theta_conn, theta_cur = keep_theta_conn_alive(
                    theta_conn, theta_cur)
                theta_cur.executemany(
                    "insert into unknown_page_type values (%s) on CONFLICT DO NOTHING;",
                    [(url, )])
                theta_conn.commit()
                continue
            page_data = CrowdriseScraper.get_crowdrise_data(
                page_type,
                root,
                url,
                latest_comment_id=url_comment_id.get(url))
            if page_data is not None:
                # file_data['file_path'] = cur_file_name
                page_data['url'] = url
                page_data['true_url'] = root.xpath(
                    '//meta[@property="og:url"]')[0].attrib['content'].replace(
                        'https://', '').replace('http://', '')
                page_data['base_true_url'] = None

                # file_data['last_scrape'] = time.gmtime(os.path.getmtime(cur_file_name))

                # handle data that requires its own table - eg the fundraisers each user has
                if 'projects' in page_data.keys():
                    projects = page_data.pop('projects')
                    all_data['user_project'] += [{
                        'username': page_data['username'],
                        'project': 'www.crowdrise.com' + x
                    } for x in projects]
                if 'events' in page_data.keys():
                    events = page_data.pop('events')
                    all_data['charity_event'] += [{
                        'charity': page_data['url'],
                        'event': 'www.crowdrise.com' + x
                    } for x in events]
                if 'team_members' in page_data.keys():
                    team_members = page_data.pop('team_members')
                    all_data['team'] += team_members

                if 'donations' in page_data.keys():
                    donations = page_data.pop('donations')
                    all_data['donation'] += donations

                all_data[page_type].append(page_data)
        except:
            print('failed on url "{}"'.format(url))
            logging.error('failed on url "{}"'.format(url))
            raise
    all_data['user_project'] = [
        x for x in all_data['user_project']
        if re.match(CROWDRISE_URL_RE, 'https://' + x['project'])
    ]
    db = db_connections.get_fungrosencrantz_schema('crowdrise')
    db_connections.multi_table_upload(data=all_data,
                                      db=db,
                                      ensure=True,
                                      process_num=None,
                                      chunk_size=3000)
    scrape_time = time.time()

    # update table with new entries
    db.query('truncate table _recently_updated')
    db.executable.execute(
        'insert ignore into _recently_updated values (%s, %s)',
        [(x, scrape_time) for x in scraped_urls])
    db.executable.execute("""
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        SELECT
          fundraiser.url,
          CASE WHEN fundraiser_url IS NULL # individual fundraiser
            THEN fundraiser.username
          ELSE # team fundraiser
            '' # give team total raised for fundraiser, then use `team` to give individual contributions
          END,
          coalesce(team_total_raised, total_raised),
          NULL,
          _recently_updated.last_scrape_unix,
          'fundraiser'
        FROM fundraiser
          join _recently_updated on _recently_updated.url = fundraiser.url
          LEFT JOIN team ON fundraiser.url = team.fundraiser_url
        GROUP BY fundraiser.url;
        
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select fundraiser_url, username, amount_raised, goal, _recently_updated.last_scrape_unix, 'team' from team
        join _recently_updated on _recently_updated.url = team.fundraiser_url;
        
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select charity.url, '', money_raised, null, _recently_updated.last_scrape_unix, 'charity' from charity
        join _recently_updated on _recently_updated.url = charity.url;
        
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select event.url, '', amount_raised, goal, _recently_updated.last_scrape_unix, 'event' from event
        join _recently_updated on _recently_updated.url = event.url;
        
        replace into crowdrise.funding_trend (url, username, amount_raised, goal, scrape_time_unix, type)
        select user.url, username, money_raised, null, _recently_updated.last_scrape_unix, 'user' from user
        join _recently_updated on _recently_updated.url = user.url;
        """)

    q = """
    update html
    set last_scrape = to_timestamp({})
    where loc in ({});""".format(
        scrape_time, ", ".join(["'" + x + "'" for x in scraped_urls]))
    theta_conn, theta_cur = keep_theta_conn_alive(theta_conn, theta_cur)
    theta_cur.execute(q)

    if test_url is None and limit != 0:
        theta_conn.commit()

    theta_cur.close()
    theta_conn.close()
    if len(html_data) < limit or test_url is not None:
        return False
    else:
        return True
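# Hypothetical sketch of the keep_theta_conn_alive helper called above (its real
# definition is not shown in this example): reopen the Postgres connection and
# cursor if the connection has been closed, otherwise hand back the existing ones.
def keep_theta_conn_alive(theta_conn, theta_cur):
    if theta_conn.closed:  # psycopg2 marks a closed connection with a non-zero .closed
        theta_conn = db_connections.get_theta_postgres_db()
        theta_cur = theta_conn.cursor()
        theta_cur.execute('set search_path = "backend"')
    return theta_conn, theta_cur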
Exemplo n.º 56
0
def prepare_mongodoc(doc):
    html = doc.get('html_rendered', doc.get('html')) or ''
    return prepare_html(html.encode('utf8'))
Exemplo n.º 57
0
import scraperwiki
import requests
import lxml.html
import lxml.etree
from lxml.cssselect import CSSSelector
import time
from datetime import datetime

url = "http://www.wetteronline.de/Berlin/Berlin.htm"
html = requests.get(url, verify=False).text

root = lxml.html.fromstring(html.encode("utf-8"))
now = datetime.now()  # scrape timestamp (kept distinct from the imported time module)

t_max_heute = root.cssselect("td.tmax")[0].text_content()
t_max_morgen = root.cssselect("td.tmax")[1].text_content()
t_max_ubermorgen = root.cssselect("td.tmax")[2].text_content()

t_min_heute = root.cssselect("td.tmin")[0].text_content()
t_min_morgen = root.cssselect("td.tmin")[1].text_content()
t_min_ubermorgen = root.cssselect("td.tmin")[1].text_content()

data = {
    'time': now,
    't_max_heute': t_max_heute,
    't_max_morgen': t_max_morgen,
    't_max_ubermorgen': t_max_ubermorgen,
    't_min_heute': t_min_heute,
    't_min_morgen': t_min_morgen,
    't_min_ubermorgen': t_min_ubermorgen
}
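# Hedged completion sketch: the snippet above builds `data` but never stores it.
# On ScraperWiki the usual way to persist it would be a call like the one below,
# keyed on the scrape timestamp.
scraperwiki.sqlite.save(unique_keys=['time'], data=data)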
Exemplo n.º 58
0
def prepare_doc(doc):
    html = doc.get('html_rendered', doc['html'])
    return prepare_htmltext(html.encode('utf8'))
Exemplo n.º 59
0
 def browserview(self, html):
     # write the page to a temporary .html file and open it in the default
     # browser; close the file first so the contents are flushed to disk
     tf = tempfile.NamedTemporaryFile(suffix='.html', delete=False)
     tf.write(html.encode())
     tf.close()
     webbrowser.open(tf.name)