Example #1
import io
import urllib.request

import PyPDF2
from bs4 import BeautifulSoup


def processPDFLink(df):
    # testing
    # url = "http://www.silicontao.com/ProgrammingGuide/other/beejnet.pdf"

    for datarow in df:
        if datarow['whitepaper'][-3:] == 'pdf':
            URL = datarow['whitepaper']
            req = urllib.request.Request(URL, headers={'User-Agent': "Magic Browser"})
            remote_file = urllib.request.urlopen(req).read()
            memory_file = io.BytesIO(remote_file)

            read_pdf = PyPDF2.PdfFileReader(memory_file)
            number_of_pages = read_pdf.getNumPages()
            for i in range(0, number_of_pages):
                pageObj = read_pdf.getPage(i)
                page = pageObj.extractText()
                #print (page)
        elif datarow['whitepaper']:
            URL = datarow['whitepaper']
            htmlString = urllib.request.urlopen(URL).read()
            html = BeautifulSoup(htmlString, 'html.parser')
            texts = html.findAll(text=True)
            visible_texts = filter(tag_visible, texts)
            page = " ".join(t.strip() for t in visible_texts)
        else:
            print("{0} has no whitepaper!".format(datarow['name']))

    # 'writer' is assumed to be a PyPDF2 PdfFileWriter populated elsewhere.
    outputStream = open("output.pdf", "wb")
    writer.write(outputStream)
    outputStream.close()
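Example #1 filters text nodes through a tag_visible helper that is not shown above. A minimal sketch of such a predicate is below; the exact list of skipped containers is an assumption for illustration, not part of the original.

from bs4.element import Comment

def tag_visible(element):
    # Skip text that lives inside non-rendered containers or inside HTML comments.
    if element.parent.name in ('style', 'script', 'head', 'title', 'meta', '[document]'):
        return False
    if isinstance(element, Comment):
        return False
    return True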
Example #2
    def check_registration_time(self):
        html = self.session.post(
            'https://mystudentrecord.ucmerced.edu/pls/PROD/bwskrsta.P_RegsStatusDisp',
            data={
                'term_in': self.term
            }).text
        soup = BeautifulSoup(html, "html.parser")
        html = soup.find('table', attrs={'class': 'datadisplaytable'})
        times = html.findAll('td')
        date = times[0].text
        time = times[1].text
        tzinfos = {"PST": gettz("America/Los_Angeles")}
        return parse("{} {} {}".format(date, time, "PST"), tzinfos=tzinfos)
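The key move in Example #2 is timezone-aware parsing with dateutil's tzinfos mapping. A standalone sketch follows; the date string is made up for illustration and stands in for the scraped <td> cells.

from dateutil.parser import parse
from dateutil.tz import gettz

# Map the abbreviation that appears in the page text to a concrete zone.
tzinfos = {"PST": gettz("America/Los_Angeles")}

# Illustrative input only; the real date and time come from the table cells above.
when = parse("Nov 04, 2019 9:00 am PST", tzinfos=tzinfos)
print(when.isoformat())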
Example #3
        rs = rs.strip("\r")
        rs = rs.strip("\n")
        return (rs)
    else:
        return ''


import scraperwiki
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 (Python 2 snippet)

pagine = range(1, 97)

for count in pagine:
    try:
        searchURL = "http://www.comuni-italiani.it/cap/%02d.html" % count
        print searchURL
        html = scraperwiki.scrape(searchURL)
        html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        tables = html.findAll('table')

    except:
        print "error for page " + str(count)
        continue  # skip this page; 'tables' would otherwise be stale or unbound

    rows = tables[5].findAll('tr')
    li = {}

    for row in rows:
        try:
            cells = row.findChildren('td')
            if len(cells) <= 4:
                li['CAP'] = cells[0].text
                li['COMUNE'] = cells[1].text
                #print cells
                #print cells[1]
Example #4

import scraperwiki
import urllib2
import urllib
import lxml.html
import BeautifulSoup as bs
import re
import xml.etree.ElementTree as ET

for i in range(2380, 5000):
    url = 'http://www.dlapiper.com/us/people/detail.aspx?attorney=' + str(i)
    try:
        page = scraperwiki.scrape(url)
        html = bs.BeautifulSoup(page)
        lawer_name = html.findAll('h1')
        lawer_post = html.findAll('h2')
        root = lxml.html.fromstring(page)
        links = root.cssselect("div.bio a")
        contact = root.cssselect("div.bio td")
        data = {
            'name': lawer_name[4].getText(),
            'post': lawer_post[3].getText(),
            'email': links[0].text,
            'contact': contact[0].text_content(),
            'id': i
        }
        scraperwiki.sqlite.save(unique_keys=['id'], data=data)
    except:
        pass
Example #5
    for content in mainContent:
        if content != "http://www.dlapiper.com/us/people/#":
            totalLinks.append(content)

i = 0
for url in totalLinks:
    if i <= 481:
        i += 1
        continue
    try:
        page = scraperwiki.scrape(url)
        html = bs.BeautifulSoup(page)
        lawer_name = html.findAll('h1')
        lawer_post = html.findAll('h2')
        root = lxml.html.fromstring(page)
        links = root.cssselect("div.bio a")
        contact = root.cssselect("div.bio td")
        data = {
            'name': lawer_name[4].getText(),
            'post': lawer_post[3].getText(),
            'email': links[0].text,
            'contact': contact[0].text_content(),
            'id': i
        }
        scraperwiki.sqlite.save(unique_keys=['id'], data=data)
        i += 1
    except:
        pass
Example #6
from lxml import etree
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 assumed (Python 2 snippet)


# get_webpage, parse_detailed_info, parse_match_info, strip_accents and
# XML_LOCATION are defined elsewhere in the original scraper.
def extract_data():
    #soup = BeautifulSoup(f)
    #matches = soup.find(id = "zapasVypis")

    #matches = matches.findAll('tr')
    output_structure = []
    counter = 0
    seasons = range(2015, 2016)
    URL = "http://www.hc-kometa.cz/zapas.asp?stats=true&sezona=%s&kde="
    for season in seasons:
        html = BeautifulSoup(get_webpage(URL % season).decode('windows-1250').encode('utf-8'))
        matches = html.findAll('tr')
        for i in range(1, len(list(matches)), 2): 
            temp_hash = {}
            detailed_structure = parse_detailed_info(matches[i+1])
            match_structure = parse_match_info(matches[i])
            output_structure.append([detailed_structure, match_structure])
            #if i == 11:
            #    break
        root = etree.Element('kometa')
        season_element = etree.Element('season')
        season_element.attrib['year'] = str(season)

        for data in output_structure:
            match = etree.Element('match')
        
            match_round = etree.Element('round')
            match_round.text = strip_accents(data[1]['round'])
            home_element = etree.Element('home')
            home_element.text = str(data[1]['is_it_home'])
            participants_element = etree.Element('participants')
            parti_home = etree.Element('home')
            parti_home.text = strip_accents(data[1]['home_team'])
           
            parti_visi = etree.Element('visitors')
            parti_visi.text = strip_accents(data[1]['visitor_team'])
            participants_element.append(parti_home)
            participants_element.append(parti_visi)
            
            result_element = etree.Element('result')
            parti_home = etree.Element('visitors')
            parti_home.text = str(1)
            parti_visitor = etree.Element('home')
            parti_visitor.text = str(2)
            result_element.append(parti_home)
            result_element.append(parti_visitor)
            
            part_result = etree.Element('endOfPeriods')
            date = etree.Element('date')
            date.text = strip_accents(data[1]['date'])
            
            referees = etree.Element('referees')
            main = etree.Element('mainReferee')
            main.text = strip_accents(data[0]['main_referee'])
            line = etree.Element('assistantReferees')
            temp_line = etree.Element('assistantReferee')
            temp_line.text = strip_accents(data[0]['line_referee'][0])
            line.append(temp_line)
            temp_line1 =  etree.Element('assistantReferee')
            try:
                temp_line1.text = strip_accents(data[0]['line_referee'][1])
            except:
                temp_line1.text = '0'
            line.append(temp_line1)
            video = etree.Element('videoReferee')
            video.text = strip_accents(data[0]['video_referee'])
            
            referees.append(main)
            referees.append(line)
            referees.append(video)
        
            penalty = etree.Element('penalty')
            penalty_home = etree.Element('home')
            penalty_visitor = etree.Element('visitor')
            penalty_home.text = data[0]['disqualification'][0]
            try:
                penalty_visitor.text = strip_accents(data[0]['disqualification'][1])
            except IndexError:
                penalty_visitor.text = ''
            penalty.append(penalty_home)
            penalty.append(penalty_visitor)
        
            powerplay = etree.Element('powerplay')
            powerplay_home = etree.Element('home')
            powerplay_visitor = etree.Element('visitor')
            powerplay_home.text = data[0]['success_rate'][0]
            powerplay_visitor.text = data[0]['success_rate'][1]
            
            powerplay.append(powerplay_home)
            powerplay.append(powerplay_visitor)
        
            shorthanded = etree.Element('shorthandedGoal')
            shorthanded_home = etree.Element('home')
            shorthanded_visitor = etree.Element('visitor')
            try:
                shorthanded_home.text = data[0]['shorthanded'][0]
                shorthanded_visitor.text = data[0]['shorthanded'][1]
            except IndexError:
                shorthanded_visitor.text = str(0)
            shorthanded.append(shorthanded_home)
            shorthanded.append(shorthanded_visitor)
        
            shots_on_goal = etree.Element('shotsOnGoal') 
            shots_on_goal_home = etree.Element('home')
            shots_on_goal_visitor = etree.Element('visitor')
            shots_on_goal_home.text = data[0]['shots_on_net'][0]
            shots_on_goal_visitor.text = data[0]['shots_on_net'][1]
            shots_on_goal.append(shots_on_goal_home)
            shots_on_goal.append(shots_on_goal_visitor)
        
            audience = etree.Element('audience')
            audience.text = strip_accents(data[0]['visitors'])
            players = etree.Element('players')
        
            players_data = data[1]['players_stats']
        
            for player in players_data:
                player_element = etree.Element('player')
                player_element.attrib['id'] = str(player['player_id'])
                name = etree.Element('name')
                name.text = strip_accents(str(player['name']))
                goals = etree.Element('goals')
                goals.text = str(player['goals'])
                assistances = etree.Element('assistances')
                assistances.text = str(player['asistances'])
                tm = etree.Element('tm')
                tm.text = str(player['tm'])
                radegast = etree.Element('radegastIndex')
                radegast.text = str(player['bilance'])
                
                player_element.append(name)
                player_element.append(goals)
                player_element.append(assistances)
                player_element.append(tm)
                player_element.append(radegast)
        
                players.append(player_element)
        
            match.append(match_round)
            match.append(home_element)
            match.append(participants_element)
            match.append(result_element)
            match.append(part_result)
            match.append(date)
            match.append(referees)
            match.append(penalty)
            match.append(powerplay)
            match.append(shorthanded)
            match.append(shots_on_goal)
            match.append(audience)
            match.append(players)  
            season_element.append(match)
        
        root.append(season_element)
        
        string = etree.tostring(root, pretty_print=True)
        f = open(XML_LOCATION, 'w')
        f.write(string)
        f.close()

        print string
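Example #6 builds every node with etree.Element followed by append; lxml's etree.SubElement does both steps in one call. A minimal sketch reusing a few element names from the example above (the output file name is arbitrary):

from lxml import etree

root = etree.Element('kometa')
season = etree.SubElement(root, 'season', year='2015')
match = etree.SubElement(season, 'match')
etree.SubElement(match, 'round').text = '1'

# write() serialises straight to the file, with an explicit XML declaration and encoding.
etree.ElementTree(root).write('kometa.xml', pretty_print=True,
                              xml_declaration=True, encoding='utf-8')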
Example #7
import csv

from bs4 import BeautifulSoup
from tqdm import tqdm


def scapper_tsv(n_links, path_html, path_tsv):

    # loading bar
    with tqdm(total=n_links) as pbar:

        for article in range(n_links):  # range of files
            try:
                html = BeautifulSoup(
                    open(path_html + "/article_{}.html".format(article)),
                    'html.parser')
            except Exception as e:  # if the article doesn't exist
                print(article, e)
                continue

            title = html.select("h1")[0].text

            # initialize tmp as intro
            tmp = 'intro'
            sections = {'intro': '', 'plot': ''}

            # take all paragraphs section by section and save only intro and plot
            # take only nodes at the first level of the article body
            for section in html.select('div.mw-parser-output > *'):
                if (section.name == 'p' and tmp == 'intro'):
                    sections['intro'] += section.text.strip()

                # change tmp on section names
                if (section.name in ['h2', 'h3']):
                    tmp = section.span['id']

                # take only the sections we are interested in
                if (section.name == 'p' and tmp in [
                        'Plot', 'Plot_summary', 'Premise'
                ]):  # check different names for plot sections
                    sections['plot'] += section.text.strip()

            # skip pages that have no Plot section
            if (sections['plot'] == ''):
                print(article, 'No Plot')
                continue

            # dictionary for infobox
            d = {
                'film_name': title,
                'Directed by': 'NA',
                'Produced by': 'NA',
                'Written by': 'NA',
                'Starring': 'NA',
                'Music by': 'NA',
                'Release date': 'NA',
                'Running time': 'NA',
                'Country': 'NA',
                'Language': 'NA',
                'Budget': 'NA'
            }

            # take elem from infobox
            info_box = html.findAll(['th', 'td'])
            for elem in info_box:
                info = elem.text.strip('\n')  # take text from the table
                if info in d:
                    d[info] = info_box[info_box.index(elem) +
                                       1].text.strip('\n')

            # select the values in order as a list to save to the .tsv
            ld = list(d.values())
            columns = [
                'title', 'intro', 'plot', 'film_name', 'Directed by',
                'Produced by', 'Written by', 'Starring', 'Music by',
                'Release date', 'Running time', 'Country', 'Language', 'Budget'
            ]
            data = [title, sections['intro'], sections['plot']] + ld[0:]

            # create and save a tsv
            with open(path_tsv + '/article_{}.tsv'.format(article),
                      'w',
                      newline='',
                      encoding='utf-8') as f_output:
                tsv_output = csv.writer(f_output, delimiter='\t')
                tsv_output.writerow(columns)
                tsv_output.writerow(data)

            pbar.update(1)
Example #8
import sys


def filter(html):
    imgs = html.findAll("img")
    if imgs:
        return imgs
    else:
        sys.exit("[~] No images detected on the page.")
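A possible way to call Example #8's helper, assuming the page is fetched with requests; the URL is only an illustration.

import requests
from bs4 import BeautifulSoup

soup = BeautifulSoup(requests.get("https://example.com").text, "html.parser")
# 'filter' here is the helper defined above, not the builtin.
for img in filter(soup):
    print(img.get("src"))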
Example #9
import sys
import time

import scraperwiki
from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 (Python 2 snippet)


def getHyperlinks(html):
    return [(a.text, a['href']) for a in html.findAll("a")]
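getHyperlinks raises a KeyError on anchors that have no href attribute. A safer variant (the name getHyperlinksSafe is made up for this sketch) keeps only tags that carry one:

def getHyperlinksSafe(html):
    # href=True restricts findAll to <a> tags that actually have an href attribute.
    return [(a.text, a['href']) for a in html.findAll("a", href=True)]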

# get_href is a helper defined elsewhere in the original scraper.
scraperwiki.sqlite.attach("elencocapitalia")
data = scraperwiki.sqlite.select(
    '''distinct CAP from ElencoCapItalia.swdata'''
)

for d in data:
    searchURL = "http://www.youinweb.it/profiles_it/" + d["CAP"]
    
    try:
        html = scraperwiki.scrape(searchURL)
        time.sleep(1)  # don't overload the site
        html = BeautifulSoup(html, convertEntities=BeautifulSoup.HTML_ENTITIES)
        tables = html.findAll('table')
    except:
        sys.exc_clear()
        continue  # skip this CAP; 'tables' would otherwise be stale or unbound

    rows = tables[0].findAll('tr')
    li = {}
    for row in rows:
        try:
            cells = row.findChildren('td')

            #print cells[1].text
            URL = searchURL + get_href(cells[1])
            #print URL
            li['CAP'] = d["CAP"]
            li['URL'] = URL