Example #1
  def scrape_nyt(self):
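    # Gather article links from the NYT most-emailed, most-viewed and most-blogged pages.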
    urls = scrapemark.scrape("""
          <body>    
              {*
                <div class='element2'>
                  <h3> <a href='{{ [links].url }}'></a> </h3>
                </div>
              *}
          </body>
        """,
        url='http://www.nytimes.com/most-popular-emailed')['links']

    urls += scrapemark.scrape("""
          <body>    
              {*
                <div class='element2'>
                  <h3> <a href='{{ [links] }}'></a> </h3>
                </div>
              *}
          </body>
        """,
        url='http://www.nytimes.com/most-popular-viewed')['links']

    urls += scrapemark.scrape("""
          <body>    
              {*
                <div class='element2'>
                  <h3> <a href='{{ [links] }}'></a> </h3>
                </div>
              *}
          </body>
        """,
        url='http://www.nytimes.com/most-popular-blogged')['links']

    return urls
def main():
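    # Walk the bills index, scrape each bill's details, and save bills and related documents to sqlite.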
    links = scrape(PATTERN, url=URL)
    print links
    #done= set([res['nzgls_identifier'] for res in sw.sqlite.select('nzgls_identifier FROM bills')])
    #print done
    for link in links['sources']:
        bills = scrape(PATTERN2, url=link)['bills']
        print bills
        for bill in bills:
            print bill
            try:
                bill = scrape(INDIVIDUAL_BILL, url=bill)
            except Exception, e:
                print "DEBUG: %s" % e
                continue
            bill['link'] = link
            do_details(bill)
            do_meta(bill)
            do_related(bill)
            for related_doc in bill['related']:
                related_doc['nzgls_identifier']=bill['nzgls_identifier']
                related_doc['bill']=bill['title']
            sw.sqlite.save(['link'], data=bill['related'], table_name='related_docs')
            cleanup(bill)
            sw.sqlite.save(['link', 'valid_from'], data=bill, table_name='bills')
def parse_swift_code_page(url, country_name, queue=Q):
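    # Scrape one country's SWIFT code listing, save the banks, and queue any additional result pages.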
    if url in DONE:
        return None
    print 'downloading', country_name
    raw = get_country_html(url)

    banks = scrape(SWIFT_CODE_PATTERN, html=raw)['banks']
    for bank in banks:
        bank['address'] = cleanup_address(bank['address'])
        bank['country_name'] = country_name
        bank['source'] = url
    sqlite.save(['swift_code'], data=banks, table_name='swift_codes')

    if 'page=' not in url:
        try:
            n_pages = max(
                int(link.split('=')[-1])
                for link in scrape(PAGINATION_PATTERN, html=raw))
            pages = [
                BASE_URL +
                '/swift-code/search-swift-complete.php?country=%s&page=%d' %
                (country_name.replace(' ', '%20'), n) for n in xrange(n_pages)
            ]
        except ValueError:  #no more pages
            pages = []
        for newurl in pages:
            queue.push((parse_swift_code_page, newurl, country_name))
    DONE.add(url)
    sqlite.save(['url'], table_name='_done', data=dict(url=url))
Example #4
  def scrape_topsy(self):
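    # Collect result links from the Topsy top100 listing and its paginated offsets.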
    urls = scrapemark.scrape("""
          <body>
            <div class="list">
              {*
                  <h3 class="title">
                  <a href='{{ [links].url }}'></a>
                  </h3>
              *}
            </div>
          </body>
        """,
        url='http://topsy.com/top100')['links']

    for page, offset in enumerate([15,30,45,60,75,90,105,120,135]):
      urls += scrapemark.scrape("""
          <body>
            <div class="list">
              {*
                  <h3 class="title">
                  <a href='{{ [links].url }}'></a>
                  </h3>
              *}
            </div>
          </body>
        """,
        url='http://topsy.com/top100?offset='+str(offset)+'&om=f&page='+str(page+1)+'&thresh=top100')['links']

    return urls
def extract_construction_profs(details):
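    # Extract the Construction Professionals names, falling back to the raw HTML block if none are linked.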
    blanket = "<h6>Construction Professionals</h6>{{ pros|html }}" ""
    targetted = """<h6>Construction Professionals</h6>{* <a href="javascript:viewBio('{{ [pros.id] }}');">{{ [pros].name }} </a> *}"""
    pros = scrape(targetted, html=details)['pros']
    if not pros:
        pros = scrape(blanket, html=details)['pros']
    return 'pros', pros
def extract_construction_profs(details):
    blanket = "<h6>Construction Professionals</h6>{{ pros|html }}"""
    targetted = """<h6>Construction Professionals</h6>{* <a href="javascript:viewBio('{{ [pros.id] }}');">{{ [pros].name }} </a> *}"""
    pros = scrape(targetted, html=details)['pros']
    if not pros:
        pros = scrape(blanket, html=details)['pros']
    return 'pros', pros
Example #7
def corporation_registration(url):
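    # Parse a corporation lobbying registration page into fields, lobbyists and subject-matter topics.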
    pattern = """Corporation:{{ name }}Name change history
Responsible Officer:{{ responsible_officer_name }}
Position Title:    {{ responsible_officer_name }}
Version:{{ registration_id }}
Type:{{ registration_type }}
Active from:{{ registration_active_from_date }}
Activity last confirmed:{{ registration_last_confirmed_date }}

A. Information about Responsible Officer and Corporation
Corporation:{{ corporation_name }}
Telephone number:{{ corporation_phone }}
Fax number:{{ corporation_fax }}
Description of the corporation's business activities: {{ corporation_business_activities }}
 
Parent:{{ parent|html }}
Subsidiary:{{ subsidiary|html }}
Was the corporation funded in whole or in part by any domestic or foreign government institution in the last completed financial year, or does the client expect funding in the current financial year?{{ is_government_funded }}

B. Lobbyists Employed by the Corporation
List of Senior Officers whose lobbying activities represent less than 20% of their Duties
{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}Name
*}{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}
*}

C. Lobbying Activity Information
Federal departments or organizations which have been or will be communicated with during the course of the undertaking: {{ agencies_talked_to }}
Communication techniques that have been used or are expected to be used in the course of the undertaking: 
{{ lobbying_activities }}
Information about Subject matter:{{ lobbying_subject_matter }}
 
Details Regarding the Identified Subject Matter
"""
    subject_matter_pattern = """Details Regarding the Identified Subject Matter
{* <tr><td>{{ [topics].category }}</td><td>{{ [topics].description }}</td></tr> *} 
"""
    page = GET(url)
    registration = scrape(pattern,
                          html=html.tostring(html.fromstring(page),
                                             encoding='utf-8',
                                             method='text'))
    registration['lobbyists'] = [
        l for l in registration['lobbyists']
        if len(l['is_public_officer'].split()) == 1
    ]
    registration['topics'] = scrape(subject_matter_pattern, html=page)
    registration['parent'] = registration['parent'].strip()
    registration['parent_name'] = registration['parent'].split('\n')[0]
    registration['subsidiary'] = registration['subsidiary'].strip()
    registration['subsidiary_name'] = registration['subsidiary'].split('\n')[0]
Example #8
def main():
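    # Walk the movements index, scrape each movement's related topics and artists, and save them to the datastore.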
    movements = scrape(MOVEMENTS_INDEX, html=get_page(URL))
    print movements
    for m in movements['movements']:
        if 'artcyclopedia.com' in m['link']:
            movement = scrape(MOVEMENTS_INDIVIDUAL, html=get_page(m['link']))
            print m['title']
            if not movement:
                movement = scrape(MOVEMENTS_INDIVIDUAL2,
                                  html=get_page(m['link']))

            relations = []
            for relation in movement['related']:
                r = dict(movement=m['title'], related_to=relation['topic'])
                if '/artists/' in relation['link']:
                    r['topic'] = 'artist'
                else:
                    r['topic'] = 'movement'
                relations.append(r)

            artists = []
            for artist in movement['artists']:
                artist['movement'] = m['title']
                dates = artist['alive'].split('-')
                try:
                    artist['birth_year'] = int(dates[0])
                    artist['death_year'] = int(dates[1])
                except ValueError:
                    if 'Born' in dates[0]:
                        artist['birth_year'] = int(dates[0].split()[1])
                        artist['death_year'] = None
                except:
                    print >> sys.stderr, "ERROR: Can't parse dates for %s: %s" % (
                        artist['name'], artist['alive'])
                    artist['birth_year'] = None
                    artist['death_year'] = None
                artist['profile_link'] = URL + artist['profile_link'][3:]
                try:
                    artist['nationality'], artist['profession'] = artist[
                        'artist_type'].split(' ', 1)
                except ValueError:
                    artist['nationality'] = artist['artist_type']
                    artist['profession'] = 'unknown'

                artists.append(artist)
            datastore.save(['name'],
                           table_name="movements",
                           data=dict(name=m['title'], link=m['link']))
            datastore.save(['movement', 'related_to'],
                           table_name="relations",
                           data=relations)
            datastore.save(['name', 'nationality'],
                           table_name="artists",
                           data=artists)
def corporation_registration(url):
    pattern = """Corporation:{{ name }}Name change history
Responsible Officer:{{ responsible_officer_name }}
Position Title:    {{ responsible_officer_name }}
Version:{{ registration_id }}
Type:{{ registration_type }}
Active from:{{ registration_active_from_date }}
Activity last confirmed:{{ registration_last_confirmed_date }}

A. Information about Responsible Officer and Corporation
Corporation:{{ corporation_name }}
Telephone number:{{ corporation_phone }}
Fax number:{{ corporation_fax }}
Description of the corporation's business activities: {{ corporation_business_activities }}
 
Parent:{{ parent|html }}
Subsidiary:{{ subsidiary|html }}
Was the corporation funded in whole or in part by any domestic or foreign government institution in the last completed financial year, or does the client expect funding in the current financial year?{{ is_government_funded }}

B. Lobbyists Employed by the Corporation
List of Senior Officers whose lobbying activities represent less than 20% of their Duties
{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}Name
*}{*
Name:{{ [lobbyists].name }}
Position title:{{ [lobbyists].title }}
Public offices held:{{ [lobbyists].public_offices_held }}
Designated public office holder:{{ [lobbyists].is_public_officer }}
*}

C. Lobbying Activity Information
Federal departments or organizations which have been or will be communicated with during the course of the undertaking: {{ agencies_talked_to }}
Communication techniques that have been used or are expected to be used in the course of the undertaking: 
{{ lobbying_activities }}
Information about Subject matter:{{ lobbying_subject_matter }}
 
Details Regarding the Identified Subject Matter
"""
    subject_matter_pattern = """Details Regarding the Identified Subject Matter
{* <tr><td>{{ [topics].category }}</td><td>{{ [topics].description }}</td></tr> *} 
"""
    page = GET(url)
    registration = scrape(pattern, html=html.tostring(html.fromstring(page), encoding='utf-8', method='text'))
    registration['lobbyists'] = [l for l in registration['lobbyists'] if len(l['is_public_officer'].split()) == 1]
    registration['topics'] = scrape(subject_matter_pattern, html=page)
    registration['parent'] = registration['parent'].strip()
    registration['parent_name'] = registration['parent'].split('\n')[0]
    registration['subsidiary'] = registration['subsidiary'].strip()
    registration['subsidiary_name'] = registration['subsidiary'].split('\n')[0]
def iter_mailing_list_quake_refs():
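    # Yield quake reports scraped from the GeoNet eqnews mailing list archive, month by month.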
    base = "http://lists.geonet.org.nz/pipermail/eqnews/"
    index_urls = scrape(PIPERMAIL_INDEX_PATTERN, url=base)
    index_urls.reverse()
    index_urls = index_urls[96:]  ### REMEMBER TO DELETE WHEN FIRST COMPLETE RUN WORKS
    print index_urls
    for month in index_urls:
        print month
        messages = scrape(PIPERMAIL_MONTH_PATTERN, url = base + month)
        messages = [base + month.replace('date.html', link) for link in messages if '0' in link]
        print messages
        for message in messages:
            yield scrape(PIPERMAIL_MESSAGE_PATTERN, url=message)
Example #11
	def get(self):
		# page ALEC_Corporations
		html = urllib2.urlopen("http://www.sourcewatch.org/index.php?title=ALEC_Corporations").read()
		
		# get for-profit corporation citations
		references = scrape("""<ol class="references"> {* <li> {{ []|html }} </li> *} </ol>""",html)
		self._add_citations(references,'ALEC_Corporations')

		# get for-profit corporations
		letters = ['A','B','C','D','E','F','G','H','I','J','K','L','M','N','O','P','Q','R','S','T','U','V','W','X','Y','Z']
		corporations = []
		for letter in letters:
			corps = scrape("""<h3> <span class="mw-headline">"""+letter+""" </span> </h3> <ul> {* <li> {{ []|html }} </li> *} </ul> """,html)
			corpdata = []
			for each in corps:
				refs = scrape(""" {* <sup class="reference">[ {{ []|int }} ]</sup> *}""",each)
				datas = scrape(""" {* {{ [] }} <sup class="reference"> </sup> *} """,each)
				name = ''
				if len(datas) > 0:
					name = datas[0]

				info = ''
				if len(datas) > 2:
					for d in datas[0:]:
						info = info+d
				elif len(datas) > 1:
					info = datas[1]
				corpdata.append((name,info,refs))
			corporations.extend(corpdata)
		self._add_corporations(corporations,1)
		
		# page ALEC_Non-Profits
		html2 = urllib2.urlopen("http://www.sourcewatch.org/index.php?title=ALEC_Non-Profits").read()
		
		# get non-profit corporation citations
		references = scrape("""<ol class="references"> {* <li> {{ []|html }} </li> *} </ol>""",html2)
		self._add_citations(references,'ALEC_Non-Profits')

		# get non-profit corporations
		nonprofits = []
		for letter in letters:
			np = scrape("""<h3> <span class="mw-headline">"""+letter+"""</span> </h3> <ul> {* <li> {{ []|html }} </li> *} </ul> """,html2)
			npdata = []
			for each in np:
				refs = scrape(""" {* <sup class="reference">[ {{ []|int }} ]</sup> *}""",each)
				datas = scrape(""" {* {{ [] }} <sup class="reference"> </sup> *} """,each)
				name = ''
				if len(datas) > 0:
					name = datas[0]
				info = ''
				if len(datas) > 2:
					for d in datas[0:]:
						info = info+d
				elif len(datas) > 1:
					info = datas[1]
				npdata.append((name,info,refs))
			nonprofits.extend(npdata)
		self._add_corporations(nonprofits,0)
def StartUp():

    #go to homepage in order to set session cookie
    start_url = "https://delecorp.delaware.gov/tin/GINameSearch.jsp"
    p = ""
    g = {"x": str(time())}
    html = ""
    uastart = {
        "User-Agent":
        "Mozilla/5.0 (Windows NT 6.1; rv:9.0) Gecko/20100101 Firefox/9.0"
    }
    try:
        html = scrape('''
<html>
{{ [y].html }}
</html>
''',
                      url=start_url,
                      get=g,
                      headers=uastart,
                      cookie_jar=myjar)

    except BadStatusLine:
        #hmmm... will skip this check for now..
        return 0
    except Exception, e:
        debug(repr(e))
        debug("scrape problem")
        return 1
def getbdschools(url):
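    # Scrape the schools table (name, type, grade, city), following each school link for contact details.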
    return scrapemark.scrape("""
            {*
                  <table>
                        <tr></tr>
                        <tr></tr>
                        {*
                           <tr>
                               <td><a href='{@{*<div class="contactinfonew"></div><div id="heading3"></div>
                                      <div class="addressbar contactinfonew">
                                      <div><div>{{[saddress]}}</div>
                                                <div>tel:{{[sphone]}}</div>
                                      </div>
                                      <div>
                                                <div><a></a><a href={{[sweb]}} target="new"></a></div>
                                                <div><a></a></div>
                                      </div>
                                      <div><a></a><a></a></div>
                                      <div><br>{{[sbrief]}}</div>
                                </div>*} @}'>{{[sname]}}</a></td>
                               <td><div>{{[stype]}}</div></td>
                               <td><div>{{[sgrade]}}</div></td>
                               <td><div>{{[scity]}}</div></td>
                           </tr>
                        *}
               </table>
            *}
            """,
                             url=url)
Example #14
def get_session_attendants(id):
	"""
		Get list of people who have attended a session
	"""
	global db
	url = BASEURL + 'to0045.asp?__ctext=0&__ksinr=' + str(id)
	print "Lade Anwesenheitsliste", url
	html = urllib2.urlopen(url).read()
	data = scrape("""
	{*
		<tr>
			<td><a href="kp0050.asp?__kpenr={{ [attendee].id|int }}&amp;grnr={{ [attendee].grnr|int }}">{{ [attendee].name }}</a></td>
			<td>{{ [attendee].organization }}</td>
			<td>{{ [attendee].function }}</td>
		</tr>
	*}
	""", html)
	persons = []
	attendants = []
	for row in data['attendee']:
		persons.append({
			'person_id': row['id'],
			'person_name': row['name'],
			'person_organization': row['organization']
		})
		attendants.append({
			'session_id': id,
			'person_id': row['id'],
			'attendance_function': row['function']
		})
	db.save_rows('people', persons, ['person_id'])
	db.save_rows('attendance', attendants, ['session_id', 'person_id'])
Example #15
def get_session_attendants(id):
    """
    Scrapes the list of (invited) attendees of a session
    """
    global db
    url = config.BASEURL + (config.URI_ATTENDANTS % id)
    print "Lade Anwesenheitsliste", url
    html = urllib2.urlopen(url).read()
    data = scrape("""
    {*
        <tr>
            <td><a href="kp0050.asp?__kpenr={{ [attendee].id|int }}&amp;grnr={{ [attendee].grnr|int }}">{{ [attendee].name }}</a></td>
            <td>{{ [attendee].organization }}</td>
            <td>{{ [attendee].function }}</td>
        </tr>
    *}
    """, html)
    persons = []
    attendants = []
    for row in data['attendee']:
        persons.append({
            'person_id': row['id'],
            'person_name': row['name'],
            'person_organization': row['organization']
        })
        attendants.append({
            'session_id': id,
            'person_id': row['id'],
            'attendance_function': row['function']
        })
    if not options.simulate:
        db.save_rows('people', persons, ['person_id'])
        db.save_rows('attendance', attendants, ['session_id', 'person_id'])
def GetPage(fileid):

    debug("GetPage:fileid: " + str(fileid))

    #search for a known company:
    params = {
        "JSPName": "GINAMESEARCH",
        "action": "Search",
        "frmFileNumber": fileid,
        "frmEntityName": ""
    }
    html = ""
    try:
        html = scrape('''
<html>
{{ [y].html }}
</html>
''',
                      url=base_url,
                      post=params,
                      headers=ua,
                      cookie_jar=myjar)

    except Exception, e:
        debug(repr(e))
        debug("scrape problem")
        return 1
Example #17
def GetPage(fileid):
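    # Scrape the English and Welsh term details for one entry and upsert them into the swdata table.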

    try:
        terms = (scrape("""
            {*
    <h2>Full Details</h2>  </div>  <div class='page_summary_3col'></div>  <div class='page_content_3col'><table width='60%'><tr><td colspan='2' class='line'><font size='2'><b>English</b></font></td></tr><tr><td class='line'><font size='2'>Term</font></td><td class='line'><font size='2'>{{ [y].en_term }}</font></td></tr><tr><td class='line'><font size='2'>Definition</font></td><td class='line'><font size='2'>{{ [y].en_definition }}</font></td></tr><tr><td class='line'><font size='2'>Context</font></td><td class='line'><font size='2'>{{ [y].en_context }}</font></td></tr></table><br><table width='60%'><tr><td colspan='2' class='line'><font size='2'><b>Welsh</b></font></td></tr><tr><td class='line'><font size='2'>Term</font></td><td class='line'><font size='2'>{{ [y].cy_term }}</font></td></tr><tr><td class='line'><font size='2'>Definition</font></td><td class='line'><font size='2'>{{ [y].cy_definition }}</font></td></tr><tr><td class='line'><font size='2'>Status</font></td><td class='line'><font size='2'>{{ [y].cy_status }}</font></td></tr><tr><td class='line'><font size='2'>Part of Speech</font></td><td class='line'><font size='2'>{{ [y].cy_part_of_speech }}</font></td></tr><tr><td class='line'><font size='2'>Gender</font></td><td class='line'><font size='2'>{{ [y].cy_gender }}</font></td></tr><tr><td class='line'><font size='2'>Number</font></td><td class='line'><font size='2'>{{ [y].cy_number }}</font></td></tr><tr><td class='line'><font size='2'>Context</font></td><td class='line'><font size='2'>{{ [y].cy_context }}</font></td></tr><tr><td class='line'><font size='2'>Subject :&nbsp;</font></td><td class='line'><font size='2'>{{ [y].cy_subject }}</font></td></tr></table></div></div></div>            
            *}
            """,
                        url=base_url + fileid))

        debug((len(terms['y']), "items found"))
        debug(terms['y'])

        for k in terms['y']:
            k['id'] = fileid
            scraperwiki.sqlite.execute("""
                INSERT OR REPLACE INTO swdata (id, en_term, en_definition, en_context, cy_term, cy_definition, cy_status, cy_part_of_Speech, cy_gender, cy_number, cy_context, cy_subject) values (:id, :en_term, :en_definition, :en_context, :cy_term, :cy_definition, :cy_status, :cy_part_of_speech, :cy_gender, :cy_number, :cy_context, :cy_subject)
            """,
                                       k,
                                       verbose=0)
            scraperwiki.sqlite.commit()
            #scraperwiki.sqlite.save(unique_keys=fileid, data=k, table_name="swdata")
    except Exception, e:
        print e
        return
Example #18
def process():
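    # Scrape the opening post and the author's replies from the Tianya thread page.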
    url='http://www.tianya.cn/publicforum/content/develop/1/905898.shtml'
    template=Template(u"""
        {*
        <table id="firstAuthor">
		    <tr>
                <td>
                    <a>${author}</a> &nbsp;发表日期:{{ [stanzas].datetime }}
		        </td>
		    </tr>
	    </table>
	    <div id="pContentDiv">
	        <div class="post">
	        {{ [stanzas].content }}
	        </div>
	    </div>
        *}
        {*
        <table>
		    <tr>
                <td>
                    <a>${author}</a> 回复日期:{{ [stanzas].datetime }}
		        </td>
		    </tr>
	    </table>
        <div class="post">
        {{ [stanzas].content }}
        </div>
        *}
    """)
    pattern=template.substitute(author=u'flp713')
    pattern=scrapemark.compile(pattern)
    stanzas=scrapemark.scrape(pattern, url=url, encoding=encoding)['stanzas']
    return stanzas
Example #19
def pagefetch(p_url, debug=False):
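    # Fetch a results page and scrape speaker thumbnails, links, names and the next-page URL.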
    html = urllib2.urlopen(p_url).read()
    results = scrapemark.scrape(
        """{*
              <div id="srp">
              <ul id="results">
               {*
               <li>
                <a><img alt="" src={{[thumbs]}}/> </a>
                <div class="result-info">
                    <h3><a href="speaker.php?{{[links]}}">{{[names]}}</a></h3>
                </div>
               </li>
               *}</ul>
               <p class="pagination">
               <a href="results.php?{{[nxurl]}}">Next</a></p>
              </div>
            *}""", html)
    if debug:
        print "Fetched Names:", len(results['names'])
        print "Fetched Relinks:", len(results['links'])
        print "Current Page:", p_url
        print "Next Page:", results['nxurl']
        return results
    else:
        return results
Example #20
    def post(self):
        logging.debug('ItemHandler.post')
        url = self.request.get('url')

        detail = scrapemark.scrape("""
                        {* <tr><td><font>{{ name }}</font></td></tr>  *}
                        {* <tr><th>Specialty</th><td>{{ specialty }}</td></tr>  *}
                        {* <tr><th>Facility</th><td>{{ facility }}</td></tr>  *}
                        {* <tr><th>Address</th><td>{{ address|html }}</td></tr>  *}
                        {* <tr><th>Phone</th><td>{{ phone }}</td></tr>  *}
                        {* <tr><th>Certification</th><td>{{ certification }}</td></tr>  *}
                        {* <tr><th>Medical School</th><td>{{ school }}</td></tr>  *}
                        {* <tr><th>Residency</th><td>{{ residence }}</td></tr>  *}
                        {* <tr><th>Gender</th><td>{{ gender }}</td></tr>  *}
                        """,
                                   url=url)

        address = detail['address'].replace('<br>', '\n').replace(
            '\t', '').replace('\r', '').replace('\n\n', '\n')
        office = models.Office.getOrCreate(detail['facility'], address,
                                           detail['phone'])

        detail['specialties'] = [
            i.strip() for i in detail['specialty'].split(';')
        ]
        doc = models.Doc(**detail)
        doc.office = office
        doc.put()
def getIO(name, urlz):
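    # Scrape the input/output flow table and build eco: RDF exchange markup for each Input/Output row.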
    ios = scrapemark.scrape("""
        {*

    <td><font>{{ [io].direction }}</font></td>
    <td><font>{{ [io].ft }}</font></td>
    <td><font>{{ [io].substance }}</font></td>
    <td>{{ [io].value }}</font></td>
    <td>{{ [io].min }}</td>
    <td>{{ [io].max }}</td>
    <td>{{ [io].std }}</td>
    <td><font>{{ [io].unit }}</font></td>
    <td><font>{{ [io].environment }}</font></td>
    <td><font>{{ [io].geo }}</font></td>
    </tr>
        *}
        """,
        url=urlz)
    inventorystr = ""
    for flow in ios['io']:
        if flow['direction'] == "Input" or flow['direction'] == "Output":
            inventorystr = inventorystr + "<eco:hasUnallocatedExchange>";
            inventorystr = inventorystr + '<eco:hasEffect><rdfs:type rdf:resource="eco:' + flow['direction'] + '" /><eco:hasTransferable><eco:Substance><rdfs:label>' + flow['substance'] + '</rdfs:label></eco:Substance></eco:hasTransferable></eco:hasEffect>'
            inventorystr = inventorystr + "<eco:hasQuantity><eco:hasUnitOfMeasure>" + flow["unit"] + "</eco:hasUnitOfMeasure><eco:hasMagnitude>" + flow["value"] + "</eco:hasMagnitude><ecoUD:maxValue>" + flow["max"] + "</ecoUD:maxValue><ecoUD:minValue>" + flow["min"] + "</ecoUD:minValue><ecoUD:maxValue>" + flow["max"] + "</ecoUD:maxValue><ecoUD:ecoUD:standardDeviation95>" + flow["std"] + "</ecoUS:ecoUD:standardDeviation95></eco:hasQuantity>";
            inventorystr = inventorystr + '</eco:hasUnallocatedExchange>';
    return inventorystr
Example #22
def parse_section(section):
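    # Try each known section pattern, normalise start/end, and flatten a list-valued stat field.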
    section_data = None
    for section_pattern in section_patterns:
        test_section_data = scrape(section_pattern, section)
        if test_section_data is not None:
            section_data = test_section_data
    if section_data is None:
        #print section
        return {}
        #return section
    recheck = False
    try:
        section_data['start']
    except KeyError:
        pass
    else:
        if ' to ' in section_data['start']:
            section_data['start'], section_data['end'] = section_data['start'].split(' to ')
        #TODO section_patterns: Fix the patterns above to avoid doing this hack
        if 'end' in section_data and \
                section_data['start'].lower().endswith('san') and \
                section_data['end'].lower().startswith('lan to '):
            section_data['start'] = 'Santolan'
            section_data['end'] = section_data['end'].lower().replace('lan to ', '')
    if 'stat' not in section_data:
        #print section
        return {}
        #return section
    if isinstance(section_data['stat'], list):
        section_data['stat'] = '-'.join(section_data['stat'])
    is_saved = False
    return section_data
def fetchresultpage(sessionid,pagenumber,county):
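    # Fetch one page of the county search results and scrape offender names and unique ids.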
    try:
        result = scrapemark.scrape("""
        <tr nowrap="" align="left" valign="top"></tr>
        {*
                <tr align='left'>
                {*
                <td align='center'></td>
                <td align='center'></td>

                <td>
                    <a href="javascript: OpenDetail('{{ [offenders].uniqueid }}')">
                        {{ [offenders].name }}
                    </a>
                </td>
{#
                <td>
                    {{ [offenders].address }}
                </td>

                <td>{{ [offenders].city }}</td>

                <td align='center'>{{ [offenders].zip }}</td>

                <td>{{ [offenders].county }}</td>

#}                    
                *}
                </tr>
        *}
        """,
        url='http://www.meganslaw.ca.gov/cgi/prosoma.dll?w6='+sessionid+'&searchby=CountyList&SelectCounty='+county+'&SB=0&PageNo='+str(pagenumber))
    except:
        return "Error"
    return result
Example #24
def harvest():
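    # Scrape image and download links from each unseen lemon URL, store them as Juice entities, and record the URL as squeezed.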
    squeezed=memcache.get('Squeezed::lemons')
    if squeezed is None:
        squeezed=Squeezed.get_by_key_name('squeezed')
        if squeezed is None:
            fresh=(baseurl+lemon['url'] for lemon in lemons())
        else:
            fresh=(baseurl+lemon['url'] for lemon in lemons() if lemon['url'] not in squeezed.lemons)
    else:
        fresh=(baseurl+lemon['url'] for lemon in lemons() if lemon['url'] not in squeezed.lemons)
    bucket=[]
    for lemon in fresh:
        logging.info('squeezing '+lemon)
        juices = scrapemark.scrape("""
            <span class='tpc_title'></span>
            {*
            <img src='{{ [juices].image }}' border=0>
            <a href='{{ [juices].download }}' target=_blank></a>
            *}
            """, url=lemon)['juices']
        logging.info(juices)
        for juice in juices:
            try:
                juice=Juice(key_name=lemon, image=juice['image'], download=juice['download'])
                juice.put()
            except BadValueError:
                logging.info(juice)
        bucket.append(lemon)
    if squeezed is None:
        squeezed=Squeezed(key_name='squeezed', lemons=bucket)
    else:
        squeezed.lemons.extend(bucket)
    squeezed.put()
    memcache.set('Squeezed::lemons', squeezed)
Example #25
def process(thread):
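    # Scrape the original post and the author's replies from every page of the thread.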
    for url in pages(thread):
        stanza_template=Template(u"""
        {*
        <table id="firstAuthor">
		    <tr>
                <td>
                    <a>${author}</a> &nbsp;发表日期:{{ [stanzas].datetime }}
		        </td>
		    </tr>
	    </table>
	    <div id="pContentDiv">
	        <div class="post">
	        {{ [stanzas].content|html }}
	        </div>
	    </div>
        *}
        {*
        <table>
		    <tr>
                <td>
                    <a>${author}</a> 回复日期:{{ [stanzas].datetime }}
		        </td>
		    </tr>
	    </table>
        <div class="post">
        {{ [stanzas].content|html }}
        </div>
        *}
        """)
        logging.info(thread['author'])
        pattern=scrapemark.compile(stanza_template.substitute(author=thread['author']))
        logging.info(pattern)
        thread['stanzas'][url]=scrapemark.scrape(pattern, url=url, encoding=encoding)['stanzas']
        logging.info(thread['stanzas'][url])
def getbdschools(url):
    return scrapemark.scrape("""
            {*
                  <table>
                        <tr></tr>
                        <tr></tr>
                        {*
                           <tr>
                               <td><a href='{@{*<div class="contactinfonew"></div><div id="heading3"></div>
                                      <div class="addressbar contactinfonew">
                                      <div><div>{{[saddress]}}</div>
                                                <div>tel:{{[sphone]}}</div>
                                      </div>
                                      <div>
                                                <div><a></a><a href={{[sweb]}} target="new"></a></div>
                                                <div><a></a></div>
                                      </div>
                                      <div><a></a><a></a></div>
                                      <div><br>{{[sbrief]}}</div>
                                </div>*} @}'>{{[sname]}}</a></td>
                               <td><div>{{[stype]}}</div></td>
                               <td><div>{{[sgrade]}}</div></td>
                               <td><div>{{[scity]}}</div></td>
                           </tr>
                        *}
               </table>
            *}
            """,url=url)
def getIO(name, urlz):
    #print url
    #html = scraperwiki.scrape(url)
    #soup = BeautifulSoup(html)
    ios = scrapemark.scrape("""
        {*

    <td><font>{{ [io].direction }}</font></td>
    <td><font>{{ [io].ft }}</font></td>
    <td><font>{{ [io].substance }}</font></td>
    <td>{{ [io].value }}</font></td>
    <td>{{ [io].min }}</td>
    <td>{{ [io].max }}</td>
    <td>{{ [io].std }}</td>
    <td><font>{{ [io].unit }}</font></td>
    <td><font>{{ [io].environment }}</font></td>
    <td><font>{{ [io].geo }}</font></td>
    </tr>
        *}
        """,
        url=urlz)
    for flow in ios['io']:
        if flow['direction'] == "Input" or flow['direction'] == "Output":
            scraperwiki.sqlite.execute("insert into SPINEIO values (?,?,?,?,?,?,?,?,?,?,?)", (name,flow['direction'],flow['ft'],flow['substance'],flow['value'],flow['min'],flow['max'],flow['std'],flow['unit'],flow['environment'],flow['geo']))
            scraperwiki.sqlite.commit() 
def main():
    #  Fetch last page index
    last_page = scraperwiki.sqlite.get_var('last_page', default=0)
    #Scrape initial list
    p = scrape(PAGE_LIST_PATTERN, url=LIST_URL)
    # print p
    print 'starting from ' + str(last_page)
    
    # 
    if last_page == 0:
        print 'first page? '
        # Scrape the first list page
        scrape_list(LIST_URL)
    
    # slice from last index
    p = p[last_page:]
    # print p
        
    # Scrape each list page
    for page in p:
        # print 'scraping page : ' + str(page)
        url = "%s&intPageNumber=%d" % (LIST_URL, page)
        # print url
        scrape_list(url)
        # save page index
        scraperwiki.sqlite.save_var('last_page', page-1)
        
    # reset page index to 0
    scraperwiki.sqlite.save_var('last_page', 0)
Example #29
def parse_entry(entry):
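    # Strip the road name from the entry text, try each main pattern, and work out the reading's timestamp.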
    updated_at = entry.updated_at
    # Add 8 hours to consider Asia/Manila timezone
    #updated_at = updated_at + datetime.timedelta(0, 8 * 60 * 60)
    now = datetime.datetime.now()
    if updated_at.day > now.day:
        updated_at = updated_at - datetime.timedelta(1)
    text = entry.text
    text = re.sub('%s[, ]?' % entry.road.name, '', text, flags=re.IGNORECASE)
    text = re.sub('http://twitpic.com/[A-Za-z0-9] ?', '', text, flags=re.IGNORECASE)
    data = None
    # Figure out if the data would make sense.
    for main_pattern in main_patterns:
        test_data = scrape(main_pattern, text)
        if test_data is not None:
            data = test_data
            break
    if data is None:
        return
    # Get the time
    #print entry.road, updated_at.strftime('%d-%H:%M'),
    stat_time = data.get('time', None)
    if stat_time:
        if 'pm' in stat_time.lower():
            add12 = True
        else:
            add12 = False
        try:
            stat_time = datetime.datetime.strptime(stat_time.replace(' ', ''), '%H:%M%p')
        except KeyError, e:
            stat_time = updated_at
        except ValueError, e:
            #print stat_time.replace(' ', ''), e
            stat_time = updated_at
Example #30
def parse_list(resp):
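    # Scrape the members table, look up substitution footnotes by reference, and build SubstitutesItem rows.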
    html = BeautifulSoup(resp.body).prettify()

    members = scrape(
        """{* 
            <tr>
                <td>
                    <a href='{{ [res].idlink }}'>{{ [res].name }}</a>
                    {* <strong>({{ [res].ref }})</strong> *}
                </td>
                <td>
                    <font>partido {{ [res].party }}</font>
                </td>
            </tr>
        *}""",
        html=html)['res']

    # TODO: The president of the chamber may appear only in a footer. Add him
    #       to the members list.

    sel = HtmlXPathSelector(resp)
    trs = sel.select('//tr/td[@align="RIGHT" and @valign="TOP" and @width="5%"]/font/strong/../../..')
    refs = {}
    for tr in trs:
        ref = tr.select('.//strong[starts-with(text(), "(")]/text()')[0].extract()[1:-1]
        sub_info = "".join(tr.select('.//td[2]/font/descendant-or-self::*/text()').extract())
        refs[ref] = sub_info

    items = []
    for info in members:
        #since = None
        #to = None
        why = None
        #substitutes = None
        if 'ref' in info and info['ref'] is not None:
            why = refs[info['ref']]

            #substitutes = sub_info['name']
            #range = get_substitution_range(sub_info['why'])
            #why = get_substitution_reason(sub_info['why'])

            #if len(range) > 0:
            #    since = range[0]
            #if len(range) > 1:
            #    to = range[1]

        date = resp.meta['date']
        id = extract_id_link(info['idlink']) + date.strftime(DATE_FMT)
        items.append(SubstitutesItem(id=id,
                                     date=date,
                                     name=info['name'],
                                     party=info['party'], 
                                     chamber=resp.url[-1],
                                     #substitutes=substitutes,
                                     #substitutes_from=since,
                                     #substitutes_to=to,
                                     substitutes_line=why))

    return items
def scrapeEpisodes(url):
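    # Scrape episode names and air dates from the episode summary table at the given URL.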
    return scrapemark.scrape("""
        {*
        <td class="summary">"<b>{{ [episode].name }}</b>"</td>
        <span class="bday dtstart published updated">{{ [episode].date }}</span>
        *}
        """,
        url=url)
Example #32
def iter_mailing_list_quake_refs():
    base = "http://lists.geonet.org.nz/pipermail/eqnews/"
    index_urls = scrape(PIPERMAIL_INDEX_PATTERN, url=base)
    index_urls.reverse()
    index_urls = index_urls[
        96:]  ### REMEMBER TO DELETE WHEN FIRST COMPLETE RUN WORKS
    print index_urls
    for month in index_urls:
        print month
        messages = scrape(PIPERMAIL_MONTH_PATTERN, url=base + month)
        messages = [
            base + month.replace('date.html', link) for link in messages
            if '0' in link
        ]
        print messages
        for message in messages:
            yield scrape(PIPERMAIL_MESSAGE_PATTERN, url=message)
def scrapeEpisodes(url):
    return scrapemark.scrape("""
        {*
        <td class="summary">"<b>{{ [episode].name }}</b>"</td>
        <span class="bday dtstart published updated">{{ [episode].date }}</span>
        *}
        """,
                             url=url)
Example #34
def get_values_for_station_and_day(station, date):
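    # Fetch the readings table for one station and day and save the values with UTC timestamps.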
    datestring = date.strftime('%d.%m.%Y')
    now = datetime.today()
    url = 'http://luadb.lds.nrw.de/LUA/wiski/pegel.php?stationsname_n=' + station + '&meindatum=' + datestring + '&tabellet=Tabelle'
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(url)
    assert br.viewing_html()
    data = scrapemark.scrape(
        """
            {*
            <td class='messwerte'>{{ [values].datetime }}</td> 
            <td class='messwerte'>{{ [values].value|float }}&nbsp;</td>
            *}
        """,
        br.response().read())
    if 'values' in data:
        datasets = []
        #print data['values']
        for row in data['values']:
            #print station, row['datetime'], ("%.2f" % row['value'])
            # datetime string can be "DD.MM HH:MM" or "HH:MM"
            match1 = re.match(
                r"([0-9]{2})\.([0-9]{2})\s+([0-9]{2}):([0-9]{2})",
                row['datetime'])
            match2 = re.match(r"([0-9]{2}):([0-9]{2})", row['datetime'])
            year = None
            if match1 is not None:
                day = match1.group(1)
                month = match1.group(2)
                year = now.year
                hour = match1.group(3)
                minute = match1.group(4)
                if now.day == 1 and now.month == 1 and int(day) == 31 and int(month) == 12:
                    year = year - 1
            elif match2 is not None:
                day = date.day
                month = date.month
                year = date.year
                hour = match2.group(1)
                minute = match2.group(2)
            if year is not None:
                mez_timestamp = int(
                    datetime(int(year), int(month), int(day), int(hour),
                             int(minute)).strftime('%s'))
                utc_timestamp = mez_timestamp - 3600
                utcdate = datetime.fromtimestamp(utc_timestamp)
                datasets.append({
                    'station':
                    station,
                    'datetime_utc':
                    utcdate.strftime('%Y-%m-%d %H:%S'),
                    'value': ("%.2f" % row['value'])
                })
        scraperwiki.sqlite.save(unique_keys=['datetime_utc', 'station'],
                                data=datasets,
                                table_name="raindata")
        return len(datasets)
Example #35
def main():
    movements = scrape(MOVEMENTS_INDEX, html=get_page(URL))
    print movements
    for m in movements["movements"]:
        if "artcyclopedia.com" in m["link"]:
            movement = scrape(MOVEMENTS_INDIVIDUAL, html=get_page(m["link"]))
            print m["title"]
            if not movement:
                movement = scrape(MOVEMENTS_INDIVIDUAL2, html=get_page(m["link"]))

            relations = []
            for relation in movement["related"]:
                r = dict(movement=m["title"], related_to=relation["topic"])
                if "/artists/" in relation["link"]:
                    r["topic"] = "artist"
                else:
                    r["topic"] = "movement"
                relations.append(r)

            artists = []
            for artist in movement["artists"]:
                artist["movement"] = m["title"]
                dates = artist["alive"].split("-")
                try:
                    artist["birth_year"] = int(dates[0])
                    artist["death_year"] = int(dates[1])
                except ValueError:
                    if "Born" in dates:
                        artist["birth_year"] = int(dates.split()[1])
                        artist["death_year"] = None
                except:
                    print >> sys.stderr, "ERROR: Can't parse dates for %s: %s" % (artist["name"], artist["alive"])
                    artist["birth_year"] = None
                    artist["death_year"] = None
                artist["profile_link"] = URL + artist["profile_link"][3:]
                try:
                    artist["nationality"], artist["profession"] = artist["artist_type"].split(" ", 1)
                except ValueError:
                    artist["nationality"] = artist["artist_type"]
                    artist["profession"] = "unknown"

                artists.append(artist)
            datastore.save(["name"], table_name="movements", data=dict(name=m["title"], link=m["link"]))
            datastore.save(["movement", "related_to"], table_name="relations", data=relations)
            datastore.save(["name", "nationality"], table_name="artists", data=artists)
def iter_recent_quakes():
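    # Yield recent quakes with absolute GeoNet URLs, merged with extra data fetched per geonet_ref.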
    for quake in scrape(pattern=RECENT_QUAKES_PATTERN,url=URL)['quakes']:
        quake['url'] = 'http://www.geonet.org.nz' + quake['url']
        quake['shaking_map_url'] = 'http://www.geonet.org.nz' + quake['shaking_map_url']
        quake['maps_url'] = 'http://www.geonet.org.nz' + quake['maps_url']
        quake['img_of_quake_location_url'] = 'http://www.geonet.org.nz' + quake['img_of_quake_location_url']
        for k, val in fetch_quake_data(quake['geonet_ref']).iteritems():
            quake[k] = val
        yield quake
def scrape_list(url):
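    # Scrape establishment ids from the list page and scrape the detail page for each one.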
    #html = mech_scrape(url)
    p = scrape(EST_PATTERN, url=url)
    print p
    for e in p:
        est_url = "%s%s%d" % (BASE_URL, DETAIL_URL, e)
        print 'scraping: ' + est_url
        print 'scraping id: ' + str(e)
        scrape_detail(est_url, e)
Example #38
 def fetch_load_url(self):
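     # Fetch the page at self.url and store its <title> as the html_title.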
     pattern = '''
         <title>{{ pagetitle }}</title>
         '''
     
     dict = scrapemark.scrape(pattern, url=self.url)
     
     self.html_title = dict['pagetitle']
     self.fetched_url = True
Example #39
def parse_list(resp):
    html = BeautifulSoup(resp.body).prettify()

    members = scrape(
        """{* 
            <tr>
                <td>
                    <a href='{{ [res].idlink }}'>{{ [res].name }}</a>
                    {* <strong>({{ [res].ref }})</strong> *}
                </td>
                <td>
                    <font>partido {{ [res].party }}</font>
                </td>
            </tr>
        *}""",
        html=html)['res']

    # TODO: The president of the chamber may appear only in a footer. Add him
    #       to the members list.

    sel = HtmlXPathSelector(resp)
    trs = sel.select('//tr/td[@align="RIGHT" and @valign="TOP" and @width="5%"]/font/strong/../../..')
    refs = {}

    for tr in trs:
        ref = tr.select('.//strong[starts-with(text(), "(")]/text()')[0].extract()[1:-1]
        refs[ref] = tr

    items = []
    for info in members:
        since = None
        to = None
        line = None
        substitutes_name = None
        substitutes_oid = None
        if 'ref' in info and info['ref'] is not None:
            try:
                tr = refs[info['ref']]
            except KeyError:
                logger.warning('Couldnt find reference %s in substitutes table.' % \
                               info['ref'], exc_info=sys.exc_info())
            line = "".join(tr.select('.//td[2]/font/descendant-or-self::*/text()').extract())
            links = tr.select('.//a')
            if links:
                substitutes_oid = extract_id_link(links[0].select('.//@href').extract()[0])[2:]
                substitutes_name = links[0].select('.//text()').extract()[0]
            range = get_substitution_range(line)
            if len(range) > 0:
                try:
                    since = datetime.strptime(range[0], PAGE_DATE_FMT).date()
                except ValueError, e:
                    logger.warning("Unable to parse substitute 'since' date", exc_info=sys.exc_info())
            if len(range) > 1:
                try:
                    to = datetime.strptime(range[1], PAGE_DATE_FMT).date()
                except ValueError, e:
                    logger.warning("Unable to parse substitute 'to' date", exc_info=sys.exc_info())
def scrape_detail(est_url, id):
    
    html = scraperwiki.scrape(est_url)
    est_details = scrape(DETAIL_PATTERN, html)

    if not est_details:
        #Try the exempt pattern
        est_details = scrape(EXEMPT_PATTERN, html)
        
        if not est_details:
            # it's either changed hands and will turn up soon, or it's new
            return
    else:
        # print est_details['inspection_date']
        est_details['inspection_date'] =  datetime.strftime(datetime.strptime(est_details['inspection_date'], '%d/%m/%Y'), '%Y-%m-%d')
        # parser.parse(est_details['inspection_date'])
        # print est_details['inspection_date']        

    # Locate
    # Attempt to find
    sql = 'lat, lng FROM swdata WHERE address = "%s" AND lat IS NOT NULL LIMIT 0,1' % est_details['address']
    latlng = scraperwiki.sqlite.select(sql)
    
    #Avoid multiple google lookups
    if latlng:
        # print 'DB Geo'
        # print latlng
        est_details['lat'] = latlng[0]['lat']
        est_details['lng'] = latlng[0]['lng']
        # print est_details['lat']
    else:
        # print 'Goog lookup'
        location = locate(est_details['address'] + ', Auckland, NZ')
        if location:
            est_details['lat'], est_details['lng'] = location 
        

    #est_details['fg_id'] = id  # Gah! id aint unique??
    #est_details['url'] = est_url # URLs are useless - the IDs float!!?? WTF!?
    
    
    # Save
    scraperwiki.sqlite.save(unique_keys=['name','address','grade','inspection_date'], data=est_details)
    print 'saved'
def getEachRecord(name, urlz):
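    # Scrape the date, copyright and process/function description from a record page and insert a SPINE row.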
    #print url
    #html = scraperwiki.scrape(url)
    #soup = BeautifulSoup(html)
    #date = soup.find(text="Date Completed").parent.parent.parent.nextSibling.nextSibling.text
    #print date
    inventory = {}
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;" face="Verdana"><em>Date Completed</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ date }}</font>
        *}
        """,
        url=urlz)
    inventory['date'] = temp['date']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;" face="Verdana"><em>Copyright</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ copyright }}</font>
        *}
        """,
        url=urlz)
    inventory['copyright'] = temp['copyright']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;"><em>Process Type</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ desc }}</font>

        *}
        """,
        url=urlz)
    inventory['description'] = temp['desc']
    temp = scrapemark.scrape("""
        {*
    <th align="left" valign="top" width="257" bgcolor="#DCDEE0"><font style="font-size: 11px;"><em>Function</em></font></th>
    <td valign="top" width="477" bgcolor="#FFFFFF"><font style="font-size: 11px;">{{ desc }}</font>

        *}
        """,
        url=urlz)
    inventory['description'] = inventory['description'] + ". " + temp['desc']
    scraperwiki.sqlite.execute("insert into SPINE values (?,?,?,?)", (name,inventory['date'],inventory['description'],inventory['copyright']))
    scraperwiki.sqlite.commit()     
def get_csv_link():
    """ Return up-to-date csv link """
    csv_pattern = """{*<a href="{{ href }}" title="Click here to download this file." target="blank">
    ECE Directory .csv*}</a>"""
    det = scrape(
        csv_pattern,
        url=
        'http://www.educationcounts.govt.nz/directories/early-childhood-services'
    )
    return 'http://www.educationcounts.govt.nz%s' % det['href']
def swift_codes(queue=Q):
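    # Scrape the list of countries and queue a parse_swift_code_page job for each one.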
    print 'Getting countries'
    raw = GET(SWIFT_URL)
    print raw
    countries = scrape(COUNTRY_PATTERN, html=raw, headers=HEADERS)['countries']
    print countries
    for country in countries:
        print country
        country['link'] = BASE_URL + country['link']
        queue.push((parse_swift_code_page, country['link'], country['name']))
def get_values():
    """Get actual readings from the stations"""
    baseurl = 'http://www.hlug.de/static/pegel/static/'
    listpageurl = baseurl + "list_N_0.htm?entryparakey=N"
    br = mechanize.Browser()
    br.set_handle_robots(False)
    br.open(listpageurl)
    assert br.viewing_html()
    links = []
    for link in br.links(url_regex=".*stat_[0-9]+.htm\?entryparakey=N"):
        links.append(link.url)
    shuffle(links)  # shuffle in place; random.shuffle returns None
    for link in links:
        subpageurl = baseurl + link
        print "Fetching", subpageurl
        br.open(subpageurl)
        html = br.response().read()
        station = scrapemark.scrape("""
                <table class="wwp_sdheader" cellpadding=4 width=720>
                    <tr>
                        <td class="wwp_sdheader" colspan=6>Station</td>
                    </tr>
                    <tr>
                        <td class="head">Name</td><td class="td1">{{ name }}</td>
                        <td class="head">Messstellen-Nr.</td><td class="td1">{{ id|int }}</td>
                        <td class="head">Flussgebiet</td><td class="td1">{{ river }}</td>
                    </tr>
                </table>
                <a target="_blank" class="graphlink" href="data_{{ linkid }}_N_WEEK.xls">4-Tage</a>
            """,
            html)
        #print station
        if station is not None and 'linkid' in station:
            excelurl = baseurl + 'data_'+ station['linkid'] +'_N_WEEK.xls'
            print excelurl
            book = xlrd.open_workbook(file_contents=urllib.urlopen(excelurl).read())
            if book:
                sheet = book.sheets()[0]
                if sheet.ncols == 2 and sheet.nrows > 0:
                    values = []
                    for rownumber in range(3, sheet.nrows): # skip first 3 rows
                        (datecell, numcell) = [ sheet.cell(rownumber, j)  for j in range(sheet.ncols) ]
                        #print "%s, %.1f" % (datecell.value, numcell.value)
                        match = re.match(r"([0-9]{2})\.([0-9]{2})\.([0-9]{4})\s([0-9]{2}:[0-9]{2})", datecell.value)
                        if match is not None:
                            values.append({
                                'datetime': match.group(3) + '-' + match.group(2) + '-' + match.group(1) + ' ' + match.group(4),
                                'station_id': station['id'],
                                'rain_mm': ("%.1f" % numcell.value),
                                
                            })
                        #print values
                    scraperwiki.sqlite.save(unique_keys=['datetime', 'station_id'], data=values, table_name="raindata")
        else:
            print "WARN: No workable data found."
Example #46
    def post(self):
        logging.debug('SyncHandler.post')
        scrape = scrapemark.scrape("""
                    {* <tr class='metalist'><td><a href='{{[details]}}'></a></td></tr> *}
                """,
                                   url=LIST_URL)

        for detailurl in scrape['details']:
            taskqueue.add(url='/tasks/item', params={'url': detailurl})

        self.redirect('/')
Example #47
def GetPage(fileid):
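    # Fetch the page for a file id and scrape the no-match notice, basic registration details and FOI status.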

    try:
        fin = urllib2.urlopen(base_url + fileid)
        text = fin.read()
        fin.close()

        pprint(text)

        #test for no match
        no_match = (scrape("""
<hr>There are {{ }} that match your search criteria.<br>
            """,
                           html=text))
        print no_match
        #TODO: Save no match
        #if no_match == "no entries":

        #basic details:
        basic_details = (scrape("""
<span class=detailstext>Registration Number: {{ [y].reg_no }}</span><P><span class=detailstext>Date Registered:&nbsp;</span>{{ [y].reg_date }}&nbsp;&nbsp;&nbsp;&nbsp;<span class=detailstext>Registration Expires:&nbsp;</span>{{ [y].reg_expiry }}<br><br><span class=detailstext>Data Controller:&nbsp;</span>{{ [y].data_controller }}<P><div class=detailstext>Address:</div><Blockquote>{{ [y].reg_address|html }}</BlockQuote><hr>
            """,
                                html=text))
        print basic_details

        debug((len(basic_details['y']), "items found"))
        debug(basic_details['y'])

        #foi:
        foi = (scrape("""
<P ALIGN=center class=detailstext>{{ }} or a Scottish public authority
            """,
                      html=text))
        print foi
        #if foi == "Freedom of Information Act 2000":

#<P class=detailstext>Other Names:</P><BlockQuote>FIRST MONEY DIRECT<br>FIRSTMONEYDIRECT.CO.UK<br></BlockQuote></BlockQuote><hr>

    except Exception, e:
        print e
        return
Example #48
def parse_search_results(url, first=False):
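    # Scrape one page of lobbyist search results, split per-type details, and queue report and registration pages.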
    pattern = """{*
<td>{{ [lobbyists].type }}:<strong>{{ [lobbyists].name }}</strong>

{{ [lobbyists].lobbyist_details|html }}<a href="{{ [lobbyists].communication_reports_link|abs }}">View communication reports</a>
</td>
          <td class="tableTop">          
            <a href="{{ [lobbyists].registration_link|abs }}>
              {{ [lobbyists].registration_begining }}to{{ [lobbyists].registration_ending }}
            </a>
          </td>
*}

{* <a href="{{ next|abs }}">Next</a> *}
"""
    if first:
        res = scrape(pattern=pattern, url=url, post=params, cookie_jar=CJ)
    else:
        res = scrape(pattern=pattern, url=url, cookie_jar=CJ)
    print res
    lobbyists = res['lobbyists']
    next_page_url = res['next']
    print next_page_url
    for lobbyist in lobbyists:
        details = html.fromstring(lobbyist['lobbyist_details'])
        if lobbyist['type'] == u'Consultant':
            lobbyist['consulting_firm'] = details[1].text
            lobbyist['client'] = details[3].text
            lobbyist['lobbyist_id'] = details[4].tail.strip()
        elif lobbyist['type'] == u'In-house Organization' or lobbyist[
                'type'] == u'In-house Corporation':
            lobbyist['responsible_officer'] = ' '.join(
                part.strip() for part in details[1].text.split())
            lobbyist['lobbyist_id'] = details[2].tail.strip()
        else:
            print 'CRAZINESS: new type found: ', lobbyist['type'],
            print lobbyist
            raise ValueError
        del lobbyist['lobbyist_details']
        Q.put((comms_report_index, lobbyist['communication_reports_link']))
        Q.put((registration, lobbyist['registration_link']))
Example #49
	def getDefinition(self, html):
		definition  = scrape(""" 
		<table border=0 cellspacing=10 cellpadding=0 width=100%> 
			<tr> 
				<td> 			
					{{ }}
				</td> 
			</tr> 
		</table> 
		""", html)
		
		return definition
Example #50
def iter_recent_quakes():
    for quake in scrape(pattern=RECENT_QUAKES_PATTERN, url=URL)['quakes']:
        quake['url'] = 'http://www.geonet.org.nz' + quake['url']
        quake['shaking_map_url'] = 'http://www.geonet.org.nz' + quake[
            'shaking_map_url']
        quake['maps_url'] = 'http://www.geonet.org.nz' + quake['maps_url']
        quake[
            'img_of_quake_location_url'] = 'http://www.geonet.org.nz' + quake[
                'img_of_quake_location_url']
        for k, val in fetch_quake_data(quake['geonet_ref']).iteritems():
            quake[k] = val
        yield quake
Example #51
def fetchsession():
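    # Return a Megan's Law session id, refreshing it after every 50 fetches.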
    global fetchnumber
    global sessionid
    if (fetchnumber > 50) or (sessionid == ""):
        sessionurl = 'http://www.meganslaw.ca.gov/cgi/prosoma.dll?searchby=curno'
        result = scrapemark.scrape("{{ page.text }}", url=sessionurl)
        sessionid = str(result['page']['text'])
    if fetchnumber <= 50:
        fetchnumber += 1
    else:
        fetchnumber = 0
    return sessionid
def GetListOfLtt():
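    # Scrape the index table: save withdrawn entries directly, then fetch details for each active entry.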

    ltt = (scrape("""
        <table>
        {*
            <td>{{ [y].ltt_id }} withdrawn</td>
        *}
        </table>
        """,
                  url=base_url))

    if ltt != None:
        if 'y' in ltt:
            debug((len(ltt['y']), "items found"))
            debug(ltt['y'])
            for k in ltt['y']:
                k['ltt_status'] = "WITHDRAWN"
                k['date_scraped'] = ''
                scraperwiki.sqlite.save(unique_keys=["ltt_id"],
                                        data=k,
                                        table_name="ltt_data")

    ltt = (scrape("""
        <table>
        {*
            <td><a href='{{ [y].ltt_url|abs }}'>{{ [y].ltt_id }}</a></td>
        *}
        </table>
        """,
                  url=base_url))

    if ltt != None:
        if 'y' in ltt:
            debug((len(ltt['y']), "items found"))
            debug(ltt['y'])
            for k in ltt['y']:
                k['ltt_status'] = "ACTIVE"
                k['date_scraped'] = ''
                GetLtt(k['ltt_url'])