Exemplo n.º 1
0
 def page_extract(self, html):
     actors = []
     for div in html.cssselect(
             'div.lister-list div.lister-item.mode-detail'):
         try:
             actor = {}
             actor[keys.entity_rank] = parse.csstext(
                 div.cssselect('span.lister-item-index.unbold.text-primary')
                 [0]).split('.')[0]
             actor[keys.entity_pic] = div.cssselect(
                 'div.lister-item-image a img')[0].attrib['src']
             actor[keys.entity_name] = parse.csstext(
                 div.cssselect('h3.lister-item-header a')[0]).strip()
             actor[keys.entity_profile] = fixed.clean_url(
                 'http://www.imdb.com' +
                 div.cssselect('div.lister-item-image a')[0].attrib['href'])
             actor[keys.entity_position] = parse.csstext(
                 div.cssselect('p.text-muted.text-small')[0]).split(
                     '|')[0].strip()
             actor[keys_hollywood.noted] = parse.csstext(
                 div.cssselect('p.text-muted.text-small a')[0]).strip()
             actor[keys_hollywood.noted_profile] = fixed.clean_url(
                 'http://www.imdb.com' + div.cssselect(
                     'p.text-muted.text-small a')[0].attrib['href'])
             print actor
             actors.append(actor)
         except Exception as e:
             print 'page_extract exception:', e
     return actors
Exemplo n.º 2
0
 def search_ufc(self, players):
     print 'ufc_find:', len(players)
     for i, p in enumerate(players):
         print 'p:', i, 'of', len(players)
         if keys.entity_profile not in p.keys():
             search_term = p[keys.entity_name] + ' site:www.ufc.com'
             d = self.cv.bing(search_term)
             google_results = yield d
             print 'google results:', google_results
             try:
                 profile = [
                     result for result in google_results
                     if result.startswith('http://www.ufc.com/fighter')
                 ][0]
                 profile = profile.split('?')[0]
                 print 'profile:', profile
                 if not fixed.clean_url(profile).endswith('/media'):
                     p[keys.entity_profile] = fixed.clean_url(
                         profile.replace('%20', '')).lower()
                     d = self.cv.goto_url(p[keys.entity_profile])
                     d.addCallback(self.cv.to_html)
                     d.addCallback(self.parse_fighter, p)
                     yield d
             except:
                 print 'missing on ufc.com:', p[keys.entity_name]
     defer.returnValue([
         p for p in players if keys.entity_profile in p
         and 'weight_class' not in p[keys.entity_profile]
     ])
Exemplo n.º 3
0
 def pullteam(self, h, players = []):
     doc = html.document_fromstring(h)
     for tr in doc.cssselect('a[title="List of sovereign states"]'):
         tr = tr.getparent().getparent()            
         while tr.getnext() is not None:
             tr = tr.getnext()
             try:
                 country = parse.csstext(tr.cssselect('th a')[0])
                 if country:
                     for td in tr.cssselect('td'):
                         if len(td.cssselect('a')) == 2:
                             try:
                                 player = {}
                                 player[keys.entity_team] = 'World Leaders'
                                 player[keys.entity_position] = parse.csstext(td.cssselect('a')[0]).split('\xc2')[0]
                                 player[keys.entity_name] = parse.csstext(td.cssselect('a')[1])
                                 player[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + td.cssselect('a')[1].attrib['href'])
                                 player[keys.entity_country] = country
                                 players.append(player)
                             except Exception as e:
                                 print 'world leader exception:', e                        
             except Exception as e:
                 print 'world leader exception:', e
     players.append({keys.entity_twitter: 'UN', keys.entity_profile: 'team:World Leaders'})
     return players
Exemplo n.º 4
0
 def callbackExtractGovernors(self, h):
     try:
         governors = []
         doc = html.document_fromstring(h)
         h2 = doc.cssselect('h2 span[id="State_governors"]')[0].getparent()
         while h2.tag != 'table':
             h2 = h2.getnext()
         #n = doc.xpath('/html/body/div[3]/div[4]/div[4]/div/table[1]')[0]
         for tr in h2.cssselect('tr'):
             g = {}            
             try:
                 
                 g[keys.entity_team] = 'Governors'
                 g[keys.entity_flag] = 'http:' + tr[0].xpath('div[1]/a/img')[0].attrib['src']
                 g[keys.entity_state] = tr[0].xpath('div[2]/a')[0].text
                 try:
                     g[keys.entity_pic] = 'http:' + tr[1].find('a').find('img').attrib['src']
                 except:
                     pass
                 g[keys.entity_name] = tr[2].cssselect('center span.vcard a')[0].attrib['title']
                 #t = tr[2].find(".//a")
                 g[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + tr[2].cssselect('center span.vcard a')[0].attrib['href'])                
                 g[keys.entity_party] = tr[4].find('a').text
                 g[keys.entity_prior_exp] = parse.csstext(tr[5])
                 g[keys.entity_assumed_office] = parse.csstext(tr[6])
                 g[keys.entity_term_expires] = parse.csstext(tr[7])
                 governors.append(g)
             except:
                 pass
         governors.append({ keys.entity_twitter: 'NatlGovsAssoc', keys.entity_profile: 'team:Governors'})
         return governors
     except Exception as e:
         print e
Exemplo n.º 5
0
 def get_skater(self, html, community):
     """Append one skater dict per ranking row to ``community``.

     Rows come from ``table.vitals.vitalsshrink`` with the first row
     skipped.  Rank, profile URL, name and country are required; age and
     points are read, when present, from the text following the ``<br>``
     elements inside the row's ``h3``.  A headshot URL derived from the
     fifth ``/``-separated part of the profile URL is stored only when a
     HEAD request for it answers 200.

     :param html: parsed document exposing ``cssselect``.
     :param community: list that receives the skater dicts.
     :returns: None.
     """
     for tr in html.cssselect('table.vitals.vitalsshrink tr')[1:]:
         skater = {}
         # The rank text loses its last two characters.
         skater[keys.entity_rank] = parse.csstext(tr[0])[:-2]
         skater[keys.entity_profile] = fixed.clean_url(
             tr[1].cssselect('a')[0].attrib['href'])
         skater[keys.entity_name] = parse.csstext(
             tr[2].cssselect('a')[0]).replace(',', '')
         skater[keys.entity_country] = parse.csstext(
             tr[2].cssselect('a')[1])
         # Each value is the second space-separated token of the text
         # after a <br>.  Two values mean (age, points); a single value
         # is the points figure.
         try:
             breaks = tr.cssselect('h3 br')
             first_value = breaks[0].tail.strip().split(' ')[1]
         except Exception:
             # Nothing usable: leave both age and points unset.
             pass
         else:
             try:
                 second_value = breaks[1].tail.strip().split(' ')[1]
             except Exception:
                 # Only one value.  Previously the age had already been
                 # stored from it before the fallback ran, leaving a
                 # bogus age equal to the points.
                 skater[keys.entity_points] = first_value
             else:
                 skater[keys.entity_age] = first_value
                 skater[keys.entity_points] = second_value
         pic_url = 'https://theboardr.blob.core.windows.net/headshots/' + skater[
             keys.entity_profile].split('/')[4] + '_900.jpg'
         check = requests.head(pic_url,
                               headers={
                                   'User-Agent': 'curl/7.35.0',
                                   'Accept': '*/*'
                               },
                               verify=True)
         if check.status_code == 200:
             skater[keys.entity_pic] = pic_url
         skater[keys.entity_team] = self.skating
         community.append(skater)
Exemplo n.º 6
0
    def get_snow(self, html, community, gender, style):
        print 'get_snow:', self.snowboarding
        for tr in html.cssselect('tr.ranking'):
            try:
                player = {}
                player[keys.entity_gender] = gender

                player[keys.entity_style] = style
                player[keys.entity_rank] = parse.csstext(
                    tr.cssselect('td span')[0]).replace('.', '')
                #player[keys.entity_rank_change] = parse.csstext(tr.cssselect('td')[2])
                #player[keys.entity_rank_change] = player[keys.entity_rank_change].replace('--','-')
                player[keys.entity_name] = parse.csstext(tr.cssselect('td')[3])
                player[keys.entity_profile] = fixed.clean_url(
                    self.snow_rankings + tr.cssselect('td')[3].cssselect(
                        'a')[0].attrib['href'].strip().split('?')[0])
                player[keys.entity_origin] = parse.csstext(
                    tr.cssselect('td')[4])
                player[keys.entity_age] = parse.csstext(tr.cssselect('td')[5])
                player[keys.entity_sponsors] = parse.csstext(
                    tr.cssselect('td')[6])
                player[keys.entity_points] = parse.csstext(
                    tr.cssselect('td')[8])
                if keys.entity_profile in player.keys():
                    player[keys.entity_team] = self.snowboarding
                    community.append(player)
            except:
                pass
Exemplo n.º 7
0
    def get_community(self, html, community, gender='Male'):
        trs = html.cssselect(
            'table[class="tableType-athlete hasGroups"]')[0].cssselect('tr')
        print 'community length:', len(trs)
        for tr in trs:
            player = {}
            try:

                player[keys.entity_rank] = parse.csstext(
                    tr.cssselect('td[class~="athlete-tour-rank"]')[0])
                #player[keys.entity_rank_change] = parse.csstext(tr.cssselect('td[class="athlete-tour-rank-change"]')[0])
                name_element = parse.csstext(
                    tr.cssselect('a[class="athlete-name"]')[0]).title()
                player[keys.entity_name] = name_element.replace(
                    'INJU', '').replace('RECO', '').strip()
                player[keys.entity_profile] = fixed.clean_url(
                    'http://www.worldsurfleague.com' +
                    tr.cssselect('a[class="athlete-name"]')[0].attrib['href'])
                player[keys.entity_origin] = tr.cssselect(
                    'span.athlete-country-flag')[0].attrib['title']
                player[keys.entity_points] = parse.csstext(
                    tr.cssselect('span[class="tour-points"]')[0])
                player[keys.entity_prizemoney] = parse.csstext(
                    tr.cssselect('td[class~="athlete-tour-prize-money"]')[0])
                if player[keys.entity_name]:
                    player[keys.entity_team] = self.surfing
                    community.append(player)
            except Exception as e:
                if keys.entity_name in player:
                    player[keys.entity_team] = self.surfing
                    community.append(player)
Exemplo n.º 8
0
 def entrepreneurVC100(self, html):
     """Build one dict per ``h2.slides`` heading of the VC 100 page.

     Rank, name and location come from the heading and its direct link;
     the profile is ``http://`` plus the host of the heading's first
     link.  When that host is non-empty (profile longer than the bare
     scheme) the location is re-read as the text after the first comma
     and investments/deals are taken from the elements following the
     heading, shifted by one when the next element starts with an image.

     :param html: parsed document exposing ``cssselect``.
     :returns: list of player dicts in document order.
     """
     components = []
     for h2s in html.cssselect('h2[class="slides"]'):
         player = {keys.entity_team: 'Entrepreneur VC 100'}
         heading_text = parse.csstext(h2s)
         # First word of the heading without its leading character.
         player[keys.entity_rank] = heading_text.split(' ')[0][1:]
         link = h2s.find('./a')
         player[keys.entity_name] = link.text
         player[keys.entity_location] = link.tail[1:]
         player[keys.entity_profile] = fixed.clean_url(
             'http://' +
             urlparse(h2s.cssselect('a')[0].attrib['href']).netloc)
         # Seven characters is the bare 'http://', i.e. no host.
         if len(player[keys.entity_profile]) > 7:
             player[keys.entity_location] = heading_text.split(
                 ',', 1)[1].strip()
             following = h2s.getnext()
             if following[0].tag.lower() == 'img':
                 player[keys.entity_pic] = following.cssselect(
                     'img')[0].attrib['src']
                 player[keys.entity_investments] = parse.csstext(
                     following.getnext()).split(' ')[-1] + 'M'
                 try:
                     player[keys.entity_deals] = parse.csstext(
                         following.getnext().getnext())
                 except Exception:
                     # The deals element is optional here.  ``Exception``
                     # instead of a bare ``except`` keeps
                     # KeyboardInterrupt/SystemExit propagating.
                     pass
             else:
                 player[keys.entity_investments] = parse.csstext(
                     following).split(' ')[-1] + 'M'
                 player[keys.entity_deals] = parse.csstext(
                     following.getnext()).split(' ')[-1]
         components.append(player)
     return components
Exemplo n.º 9
0
 def getplayers(self, html, team):
     team['players'] = []
     for a in html.cssselect('a.squadPlayerCard'):
         print a
         #print 'hey:', player_span, player_span.cssselect('div.playerPhoto img')[0].attrib
         player = {}
         player[keys.entity_profile] = fixed.clean_url(ipl_base +
                                                       a.attrib['href'])
         #print 'player 1:', player
         player[
             keys.
             entity_pic] = 'http://iplstatic.s3.amazonaws.com/players/210/' + a.cssselect(
                 'div.playerPhoto')[0].cssselect('img[data-player-id]')[
                     0].attrib['data-player-id'] + '.png'
         #print 'player 2:', player
         player[keys.entity_name] = parse.csstext(
             a.cssselect('p.player-name')[0])
         #print 'player 3:', player
         if len(a.cssselect('span.captain')) > 0:
             player[keys.entity_captain] = True
         if len(a.cssselect('span.overseas-player')) > 0:
             player[keys.entity_foreign] = True
         if len(a.cssselect('span.wicket-keeper')) > 0:
             player[keys.entity_position] = "Wicket Keeper"
         for li in a.cssselect('ul.stats li'):
             label = parse.csstext(li.cssselect('span.label')[0])
             value = parse.csstext(li.cssselect('span.value')[0])
             player[label.lower()] = value
         print player
         team['players'].append(player)
     print 'length of team:', len(team['players'])
     return team
Exemplo n.º 10
0
 def createCeoTeam(self, components):
     ceos = []
     for player in components:
         if keys.entity_ceo in player:
             for ceo in player[keys.entity_ceo]:
                 ceo_player = {keys.entity_team: 'Billion Dollar CEO'}
                 ceo_player[keys.entity_company] = player[keys.entity_name]
                 ceo_player.update({keys.entity_name: ceo})
                 print 'lookup:', ceo
                 d = self.cv.google(ceo, domain='en.wikipedia.org')
                 d.addErrback(self.error_league)
                 res = yield d
                 if res and res[0]:
                     print 'wikipedia to profile:', res[0]
                     ceo_player[keys.entity_profile] = fixed.clean_url(
                         res[0])
                     isb = yield self.cv.goto_url(
                         ceo_player[keys.entity_profile]).addCallback(
                             lambda ign: self.cv.to_html()).addCallback(
                                 self.is_born)
                     if isb:
                         ceos.append(ceo_player)
         else:
             print 'NO ceo!', player[keys.entity_name]
     defer.returnValue(ceos)
Exemplo n.º 11
0
    def getplayers(self, html, team):
        print 'getplayers'
        team['team'] = parse.csstext(
            html.cssselect(
                'div[class="cabecera-seccion"] span[class="titulo"]')[0])
        print team['team']
        for tr in html.cssselect(
                'div[class="rotar-tabla margen"] div[id="DataTables_Table_0_wrapper"] table[id="DataTables_Table_0"] tr'
        )[1:]:
            #if parse.csstext(positions) != 'Coach':
            player = {
                keys.entity_position: parse.csstext(tr.cssselect('td')[0])
            }
            a = tr.cssselect('td')[1].cssselect('a')[0]
            player[keys.entity_profile] = fixed.clean_url(a.attrib['href'])
            player[keys.entity_pic] = a.cssselect('img')[0].attrib['src']
            try:
                jersey = parse.csstext(tr.cssselect('td')[2])
                if jersey:
                    player[keys.entity_jersey] = jersey
            except:
                print 'no jersey'
            player[keys.entity_yellows] = parse.csstext(tr.cssselect('td')[15])
            player[keys.entity_reds] = parse.csstext(tr.cssselect('td')[16])
            player[keys.entity_goals] = parse.csstext(tr.cssselect('td')[18])

            team['players'].append(player)
        print[p[keys.entity_profile] for p in team['players']]
Exemplo n.º 12
0
 def pullteams(self, h):
     players=[]
     doc = html.document_fromstring(h) 
     h3 = doc.cssselect('h3 ~ ul li')
     print 'h3 length:', len(h3)
     h2 = doc.cssselect('h2 ~ ul li')
     print 'h2 length:', len(h2)
     h3.extend(h2)
     for li in h3:
         player = {}
         try:
             player[keys.entity_team] = 'PAC'
             player[keys.entity_topic] = parse.csstext(li.getparent().getprevious().getchildren()[0])
             if player[keys.entity_topic] != 'External links':
                 try:
                     href = li.cssselect('a')[0].attrib['href']
                     if not urlparse(href).scheme and href:
                         href = 'http://en.wikipedia.org' + href
                     player[keys.entity_profile] = fixed.clean_url(href)                    
                     player[keys.entity_name] = parse.csstext(li.cssselect('a')[0])
                     if player[keys.entity_name] and player[keys.entity_profile]:
                         if player[keys.entity_name].rfind(' - ') > 0:
                             player[keys.entity_location] = player[keys.entity_name][player[keys.entity_name].rfind(' - ') + 3:]
                             player[keys.entity_name] = player[keys.entity_name][:player[keys.entity_name].rfind(' - ')]
                         #print player
                         players.append(player)
                 except Exception as e2:
                     print 'exception inner:', e2
         except: 
             pass
     players.append({keys.entity_profile: 'team:PAC', keys.entity_twitter: 'FEC'})
     return players
Exemplo n.º 13
0
    def sprintcup_drivers(self, html):
        drivers = []
        for tr in html.cssselect('table.driver-list-table tr')[1:]:
            '''
            <tr>
                <td class="driver-name-td"><a href="/drivers/driversaj-allmendinger/">Allmendinger, AJ</a></td>
                <td class="driver-number-td">47</td>
                <td class="driver-make-td"><img src="/wp-content/uploads/sites/7/2017/01/Chevy-Driver-Page-New-2-160x811-265x180.png"></td>
                <td class="driver-team-td">JTG Daugherty Racing</td>
    
            </tr>
            '''
            driver = {}
            driver[keys.entity_profile] = fixed.clean_url(
                NASCAR.nascar_url +
                tr.cssselect('td.driver-name-td a')[0].attrib['href'])
            driver[keys.entity_name] = parse.csstext(
                tr.cssselect('td.driver-name-td a')[0]).strip()
            driver[keys.
                   entity_name] = driver[keys.entity_name].split(',')[1].strip(
                   ) + ' ' + driver[keys.entity_name].split(',')[0].strip()

            driver[keys.entity_carnumber] = parse.csstext(
                tr.cssselect('td.driver-number-td')[0])
            team = parse.csstext(tr.cssselect('td.driver-team-td')[0]).strip()
            if not team:
                team = NASCAR.unaffiliated
            driver[keys.entity_team] = team

            driver[keys.entity_carnumber] = parse.csstext(
                tr.cssselect('td.driver-number-td')[0])
            driver[keys.entity_circuit] = self.get_common_name()
            print driver
            drivers.append(driver)
            #driver[keys.entity_rank] = parse.csstext(div.cssselect('div.position')[0]).strip()
            #driver[keys.entity_name] = parse.csstext(div.cssselect('div.driver div.driver-first')[0]).split() + ' ' + parse.csstext(div.cssselect('div.driver div.driver-last')[0]).split()

            #<div class="driver"><div class="driver-first"> Martin</div><div class="driver-last">Truex Jr.</div><div class="legend-symbols"></div></div>

            #tr = driver_art.getparent().getparent().getparent().getparent().getparent()

            #driver[keys.entity_points] = parse.csstext(tr.cssselect('td')[3])
            #driver[keys.entity_points_behind] = parse.csstext(tr.cssselect('td')[4]).replace('--','')
            #driver[keys.entity_starts] = parse.csstext(tr.cssselect('td')[5])
            #driver[keys.entity_wins] = parse.csstext(tr.cssselect('td')[6])
            #driver[keys.entity_top5] = parse.csstext(tr.cssselect('td')[7])
            #driver[keys.entity_top10] = parse.csstext(tr.cssselect('td')[8])
            #driver[keys.entity_dnf] = parse.csstext(tr.cssselect('td')[9])

            #if not team:
            #    team = NASCAR.unaffiliated
            #elif 'team:' + team not in [t[keys.entity_profile] for t in drivers]:
            #    drivers.append({ keys.entity_profile: 'team:' + team })

            driver[keys.entity_circuit] = self.get_common_name()
            drivers.append(driver)
        return drivers
Exemplo n.º 14
0
 def entities(self):
     """Resolve a profile URL for each firm and return the resolved ones.

     This is a generator that yields deferreds and finishes through
     ``defer.returnValue``.  Each firm from ``self.firms()`` is searched
     by name through ``self.cv.bing``; the first citation, if any, is
     stored as the firm's profile.

     :returns: (via ``defer.returnValue``) the firms that have a profile.
     """
     candidates = self.firms()
     for candidate in candidates:
         lookup = self.cv.bing(candidate[keys.entity_name])
         lookup.addErrback(self.error_league)
         hits = yield lookup
         if not hits:
             continue
         candidate[keys.entity_profile] = fixed.clean_url(hits[0])
     resolved = [c for c in candidates if keys.entity_profile in c]
     defer.returnValue(resolved)
Exemplo n.º 15
0
 def studio_detail(self, html, studio):
     try:
         info = html.cssselect('table.infobox')[0]
         try:
             studio[keys.entity_name] = parse.csstext(
                 info.cssselect('caption')[0])
         except:
             studio[keys.entity_name] = studio[keys.entity_company]
         try:
             studio[keys.entity_pic] = fixed.clean_url(
                 'http:' + info.cssselect('.logo a img')[0].attrib['src'])
         except:
             pass
         for th in info.cssselect('tr th'):
             if parse.csstext(th) == 'Website':
                 studio[keys.entity_website] = fixed.clean_url(
                     th.getnext().cssselect('a')[0].attrib['href'])
     except:
         pass
     print 'studio:', studio
     return True
Exemplo n.º 16
0
    def gather_active_roster(self, h, team):
        doc = html.document_fromstring(h)
        #/html/body/div[1]/div[3]/div[1]/section/div/section[1]/table
        team[keys.entity_team] = doc.cssselect(
            'meta[property="og:site_name"]')[0].attrib['content']
        for t in doc.xpath('//table[@class="data roster_table"][@summary]'):

            for pt in t.xpath('preceding-sibling::h4'):
                position = pt.text
                if pt.text[-1] == 's':
                    position = pt.text[:-1]
                for player in t.xpath('tbody/tr[position() > 0]'):
                    #print etree.tostring(player)
                    try:
                        player_dict = {}
                        player_dict[keys.entity_position] = position

                        player_dict[keys.entity_profile] = fixed.clean_url(
                            'http://m.mlb.com' + player[2].xpath('a/@href')[0])
                        if player[0].text:
                            player_dict[keys.entity_jersey] = player[0].text
                            if player_dict[keys.entity_jersey] == '42':
                                try:
                                    e = Entity().get_item(
                                        league='mlb',
                                        profile=player_dict[
                                            keys.entity_profile])
                                    player_dict[keys.entity_jersey] = e[
                                        keys.entity_jersey]
                                except:
                                    pass
                        player_dict[keys.entity_name] = player[2].xpath(
                            'a[starts-with(@href, "/player/")]')[0].text
                        try:
                            player_dict[keys.entity_status] = etree.tostring(
                                player[2],
                                method="text").strip().split('<br>')[1]
                            print 'has status:', player_dict[
                                keys.entity_status]
                        except:
                            pass
                        player_dict[keys.entity_height] = player[4].text
                        player_dict[keys.entity_weight] = player[5].text
                        player_dict[keys.entity_born] = player[6].text
                        bt = player[3].text
                        player_dict['bats'] = bt.split("/")[0]
                        player_dict['throws'] = bt.split("/")[1]
                        #print player_dict
                        team['players'].append(player_dict)
                    except Exception as e:
                        print 'player exception:', e
        print 'team:', team['team'], 'players length:', len(team['players'])
        return team
Exemplo n.º 17
0
    def get_company_details(self, company, doc):
        """Copy overview details from a company page into ``company``.

        Any ``dd`` containing 'has been closed' marks the company as
        closed.  Under each 'Overview' heading the definition list is
        read: headquarters, description, founders and categories are
        stored as text, the website as a cleaned URL, and the social
        links as Facebook / Twitter handles and a LinkedIn path.

        :param company: dict updated in place.
        :param doc: parsed document exposing ``cssselect``.
        :returns: None.
        """
        for dd in doc.cssselect('dd'):
            if 'has been closed' in parse.csstext(dd):
                company[keys.entity_closed] = True

        # Labels whose value is stored as plain text, mapped to their key.
        plain_text_fields = {
            'headquarters': keys.entity_headquarters,
            'description': keys.entity_description,
            'founders': keys.entity_founders,
            'categories': keys.entity_sector,
        }

        for h2 in doc.cssselect('h2'):
            if parse.csstext(h2) != 'Overview':
                continue
            terms = h2.getparent().getnext().cssselect(
                'div.definition-list.container dt')
            for dt in terms:
                dd = dt.getnext()
                # The label is the term text without its last character,
                # compared case-insensitively.
                label = parse.csstext(dt)[:-1].lower()
                if label in plain_text_fields:
                    company[plain_text_fields[label]] = parse.csstext(dd)
                elif label == 'website ':
                    # NOTE(review): unlike the other labels this one is
                    # matched with a trailing space -- confirm.
                    company[keys.entity_profile] = fixed.clean_url(
                        parse.csstext(dd))
                elif label == 'social':
                    for a in dd.cssselect('a[data-icons]'):
                        icon = a.attrib['data-icons']
                        if icon == keys.entity_facebook:
                            company[keys.entity_facebook] = fixed.clean_url(
                                a.attrib['href']).rsplit('/', 1)[1]
                        if icon == keys.entity_twitter:
                            handle = fixed.clean_url(
                                a.attrib['href']).rsplit('/', 1)[1]
                            if handle.startswith('@'):
                                handle = handle[1:]
                            company[keys.entity_twitter] = handle
                        if icon == keys.entity_linkedin:
                            company[keys.entity_linkedin] = fixed.clean_url(
                                a.attrib['href']).replace(
                                    'http://www.linkedin.com/', '')
Exemplo n.º 18
0
 def table(self):
     """Return the fixed list of poker-related entries.

     Each entry is a dict with a name, a ``'twitter'`` handle and a
     profile URL passed through ``fixed.clean_url``.

     :returns: list of six dicts, in the order listed below.
     """
     # (name, twitter handle, profile URL)
     listing = (
         ('Andrew Feldman', 'AFeldmanESPN',
          'http://en.wikipedia.org/wiki/andrew_feldman_(poker_player)'),
         ('The Hendon Mob', 'thehendonmob',
          'http://www.thehendonmob.com'),
         ('World Poker Tour', 'WPT',
          'http://www.worldpokertour.com'),
         ('Rio Las Vegas', 'RioVegas',
          'http://en.wikipedia.org/wiki/rio_all_suite_hotel_and_casino'),
         ('Party Poker', 'partypoker',
          'http://www.partypoker.com/'),
         ('European Poker Tour', 'PokerStarsEPT',
          'http://www.europeanpokertour.com'),
     )
     players = []
     for name, handle, url in listing:
         players.append({
             keys.entity_name: name,
             'twitter': handle,
             keys.entity_profile: fixed.clean_url(url),
         })
     return players
Exemplo n.º 19
0
    def getICOs(self):
        html = yield self.cv.goto_url(
            'https://coinmarketcap.com/all/views/all/').addCallback(
                self.cv.to_html)
        trs = html.cssselect(
            'div.table-responsive.compact-name-column div.dataTables_wrapper.no-footer table tr'
        )
        icos = []
        for tr in trs[1:][:1200]:

            name = parse.csstext(tr.cssselect('a.currency-name-container')[0])
            rank = parse.csstext(tr[0])
            symbol = parse.csstext(tr.cssselect('td.col-symbol')[0])

            try:
                href = tr.cssselect('span.currency-symbol a')[0].attrib['href']
                profile = fixed.clean_url('http://coinmarketcap.com' + href)

                print 'rank:', rank, 'name:', name, 'sybol:', symbol
                ico = {
                    keys.entity_name: name,
                    keys.entity_profile: profile,
                    keys_market.symbol: symbol,
                    keys.entity_rank: rank
                }

                try:
                    market_cap = twitter_keys.numTwitter(
                        int(
                            parse.csstext(
                                tr.cssselect('td.no-wrap.market-cap.text-right'
                                             )[0]).replace('$', '').replace(
                                                 ',', '').strip()))
                    ico[keys.entity_market_cap] = market_cap
                except:
                    pass
                try:
                    supply = twitter_keys.numTwitter(
                        int(
                            parse.csstext(
                                tr.cssselect(
                                    'td.no-wrap.text-right.circulating-supply')
                                [0]).replace('*', '').replace(',',
                                                              '').strip()))
                    ico[keys.entity_circulating_supply] = supply
                except:
                    pass

                icos.append(ico)
            except:
                pass
        defer.returnValue(icos)
Exemplo n.º 20
0
 def get_2010s(self, html):
     """List the performers linked under the "2010s" heading.

     The list items are taken from the element that follows the heading
     containing ``span#2010s``; the first link of each item supplies the
     Wikipedia profile URL and, through its ``title``, the name.

     :param html: parsed document exposing ``cssselect``.
     :returns: list of dicts with profile and name.
     """
     heading = html.cssselect('h2 span[id="2010s"]')[0].getparent()
     entries = heading.getnext().cssselect('ul li')
     performers = []
     # Lazy, so items are still handled strictly one after another.
     for link in (entry.cssselect('a')[0] for entry in entries):
         performers.append({
             keys.entity_profile:
             fixed.clean_url('http://en.wikipedia.org' + link.attrib['href']),
             keys.entity_name:
             link.attrib['title'],
         })
     return performers
Exemplo n.º 21
0
 def callbackExtractSenate(self, h):
     senators = []
     doc = html.document_fromstring(h)
     try:
         trs = doc.cssselect('h2 span[id="Senators"]')[0].getparent()
         while trs.tag != 'table':
             trs = trs.getnext()                        
         trs = trs.cssselect('tr')
         state = None
         party = None
         for i, tr in enumerate(trs[1:]):
             if i % 2 == 0:
                 offset = 1
                 if len(tr.cssselect('td')) == 9:
                     party = self.get_party(tr)
                 if len(tr.cssselect('td')) == 8:
                     offset = 0
                 state = tr[offset][0].text
             else:
                 if len(tr.cssselect('td')) == 8:
                     party = self.get_party(tr)
                 offset = 0
                 if len(tr.cssselect('td')) == 7:
                     offset = -1
             senator = {}
             senator[keys.entity_team] = 'US Senate'
             senator[keys.entity_state] = state
             
             senator[keys.entity_pic] = fixed.clean_url('http:' + tr[1 + offset].cssselect('img')[0].attrib['src'])
             senator[keys.entity_name] = tr[2 + offset].cssselect('span.vcard a')[0].attrib['title']
             senator[keys.entity_profile] = fixed.clean_url('http://en.wikipedia.org' + tr[2 + offset].cssselect('span.fn a')[0].attrib['href'])
             senator[keys.entity_party] = party
             senator[keys.entity_born] = parse.csstext(tr[3 + offset].cssselect('span.bday')[0])
             senator[keys.entity_term_expires] = parse.csstext(tr[7 + offset]).split(' ')[-1]
             senators.append(senator)
     except Exception as e:
         print 'senate exception:', e
     senators.append({keys.entity_twitter: 'SenateHistory', keys.entity_profile: 'team:US Senate' })
     return senators
Exemplo n.º 22
0
 def thefundedTopRatedVCs(self, html):
     components = []
     for rank in html.cssselect('div[id="post"]')[0].cssselect(
             'p[class="larger red"]'):
         player = {keys.entity_team: 'TheFunded Top Partners'}
         player[keys.entity_rank] = parse.csstext(rank)[:-1]
         player[keys.entity_name] = parse.csstext(
             rank.getnext().cssselect('a')[0])
         player[keys.entity_profile] = fixed.clean_url(
             'http://www.thefunded.com' +
             rank.getnext().cssselect('a')[0].attrib['href'])
         player[keys.entity_firm] = parse.csstext(
             rank.getparent().cssselect('a[class="fund"]')[0])
         print 'player:', player
         components.append(player)
     return components
Exemplo n.º 23
0
    def get_community(self, html, community):
        subteams = {}
        for cycling_team in html.cssselect('.team_box')[0].cssselect('ul li'):
            jersey_pic = cycling_team.cssselect('a img')[0].attrib['src']
            thref = 'http://www.cyclingnews.com' + cycling_team.cssselect(
                'a')[0].attrib['href']
            print 'team url:', thref
            subteam = {}
            subteam[keys.entity_jersey_pic] = jersey_pic
            subteams[thref] = subteam
        for k, st in subteams.iteritems():
            d = self.cv.goto_url(k)
            d.addCallback(self.cv.to_html)
            d.addErrback(self.error_league)
            subhtml = yield d
            team_name = parse.csstext(
                subhtml.cssselect('div[class="team-name"]')[0])
            st[keys.entity_profile] = 'team:' + team_name
            print 'cycle team:', st

            for rider in subhtml.cssselect('div.riders div.rider'):
                player = {}
                player[keys.entity_team] = team_name
                player[keys.entity_name] = parse.csstext(
                    rider.cssselect('a')[0])
                player[keys.entity_profile] = fixed.clean_url(
                    'http://www.cyclingnews.com' +
                    rider.cssselect('a')[0].attrib['href'])
                #print 'found one!:', player
                community.append(player)
        for p in community:
            d = self.cv.goto_url(p[keys.entity_profile] + "/")
            d.addCallback(self.cv.to_html)
            d.addErrback(self.error_league)
            riderhtml = yield d
            try:
                rider = riderhtml.cssselect('rider-info-boxout')[0]
                p[keys.entity_pic] = rider.cssselect(
                    'img.rider-image')[0].attrib['src']
                p[keys.entity_dob] = parse.csstext(
                    rider.cssselect('span[itemprop="birthDate')[0])
                p[keys.entity_nationality] = parse.csstext(
                    rider.cssselect('span[itemprop="nationality')[0])
            except:
                pass
        community.extend(subteams.values())
        defer.returnValue(community)
Exemplo n.º 24
0
 def adjustments(self, components):
     print 'adjustments components len:', len(components)
     for c in components:
         cites = yield self.cv.bing(c[keys.entity_name])
         if cites[0]:
             from urlparse import urlparse
             c[keys.entity_profile] = fixed.clean_url(
                 'http://' + urlparse(fixed.simpleurl(cites[0])).netloc)
             print c[keys.entity_name], 'bing profile:', c[
                 keys.entity_profile]
             for key in wbd:
                 if key[0] == c[keys.entity_name] and c[
                         keys.entity_profile] != key[1]:
                     c[keys.entity_profile] = key[1]
                     print '        ', c[keys.entity_profile]
             print 'profile:', c[keys.entity_profile], c[keys.entity_name]
     defer.returnValue([c for c in components if keys.entity_profile in c])
Exemplo n.º 25
0
 def process_team(self, doc, team):
     print 'process team'
     team['players'] = []
     for section in doc.cssselect('section.row.nba-player-index__row'):            
         for p in section.cssselect('section.nba-player-index__trending-item'):
             player = {}
             player[keys.entity_jersey] = parse.csstext(p.cssselect('span.nba-player-trending-item__number')[0])
             anchor = p.cssselect('a')[0]
             player[keys.entity_name] = anchor.attrib['title']
             player[keys.entity_profile] = fixed.clean_url(NBA.nba_url + anchor.attrib['href'])
             player[keys.entity_pic] = 'http:' + anchor.cssselect('div.nba-player-index__image div.nba-player-index__headshot_wrapper img')[0].attrib['data-src']
             player[keys.entity_position] = parse.csstext(p.cssselect('div.nba-player-index__details span')[0])
             player[keys.entity_height] = parse.csstext(p.cssselect('div.nba-player-index__details strong')[0]).split(' ')[0] + '\' ' + parse.csstext(p.cssselect('div.nba-player-index__details strong')[1]).split(' ')[0] + '\"'
             player[keys.entity_weight] = parse.csstext(p.cssselect('div.nba-player-index__details strong')[2])
             team['players'].append(player)
     print 'team:', team['team'], 'players length:', len(team['players'])
     return team
Exemplo n.º 26
0
 def extract_players(self, doc, team):
     try:
         n = doc.xpath('//div[@id="searchResultsLargeTable"]//tbody[1]')[0]
         for a in n:
             player_data = {}
             player_data[keys.entity_name] = a[2][0].text
             player_data[keys.entity_profile] = fixed.clean_url('http://www.nfl.com' + a[2][0].attrib['href'])
             player_data[keys.entity_position] = a[0].text
             player_data[keys.entity_status] = a[3].text
             try:
                 if a[1].text:
                     player_data[keys.entity_jersey] = a[1].text
             except Exception as e:
                 print 'player exception:', e, team['team'] 
             team['players'].append(player_data)
     except Exception as e2:
         print 'team exception:', e2, team['team'] 
Exemplo n.º 27
0
    def scrape_page(self, html, team):
        for li in html.cssselect('li[data-pos]'):
            ranking = li.attrib['data-pos']
            celebrity_handle = li.cssselect(
                'div[class="clr"] div[class="name-bio"] a[class="uname"]'
            )[0].attrib['href'][1:]
            name = parse.csstext(
                li.cssselect(
                    'div[class="clr"] div[class="name-bio"] a[class="name"] span'
                )[0])
            partial = {keys.entity_rank: ranking, keys.entity_name: name}
            existing_league = None
            for count_e in Entity().query_2(index=Entity.index_twitter_league,
                                            twitter__eq=celebrity_handle):
                if count_e[keys.entity_league] != 'celebrity':
                    existing_league = count_e[keys.entity_league]
            if not existing_league:
                partial[keys.entity_twitter] = celebrity_handle
            else:
                print 'already in league:', existing_league
            cite = yield self.cv.google(name + ' wikipedia',
                                        results=1,
                                        domain='en.wikipedia.org')
            try:
                clean_cite = self.check_cite(fixed.clean_url(cite[0]))

                print name, clean_cite, ranking
                html = yield getPage(str(clean_cite)).addCallback(etree.HTML)
                self.is_born(html, partial, clean_cite)
                if keys.entity_profile in partial.keys():
                    dob = ''
                    try:
                        dob = partial[keys.entity_dob]
                    except:
                        pass
                    print '{:5s}'.format(
                        partial[keys.entity_rank]), '{:40s}'.format(
                            partial[keys.entity_name]), '{:20s}'.format(
                                celebrity_handle), dob

                    team.append(partial)
                else:
                    print '    not born:', ranking, '{:40s}'.format(
                        'https://twitter.com/' + celebrity_handle), name
            except:
                print 'cite exception:', ranking, celebrity_handle, name
Exemplo n.º 28
0
 def callbackDepartments(self, h):
     """Extract department entries from the page source ``h``.

     Every anchor matched by ``li a, h3 a`` is a candidate. Its text up to
     the first ``(`` becomes the name, after removing any prefix listed in
     ``self.sw`` and trimming whitespace. Anchors whose cleaned ``href`` is
     listed in ``self.skip`` are ignored.

     An entry for the ``team:Departments`` profile is always appended last.

     Returns the list of dicts. An anchor without ``href`` raises
     ``KeyError``, which is not caught here.
     """
     doc = html.document_fromstring(h)
     departments = []
     for anchor in doc.cssselect('li a, h3 a'):
         title = parse.csstext(anchor).split('(')[0]
         for prefix in self.sw:
             if title.startswith(prefix):
                 title = title[len(prefix):].strip()
         title = title.strip()
         url = fixed.clean_url(anchor.attrib['href'])
         if url in self.skip:
             continue
         departments.append({
             keys.entity_name: title,
             keys.entity_profile: url,
             keys.entity_team: 'Departments',
         })
     departments.append({
         keys.entity_profile: 'team:Departments',
         keys.entity_twitter: 'USGAO',
         keys.entity_name: 'Oversight Committee',
     })
     return departments
Exemplo n.º 29
0
 def callbackExtractHouse(self, h):
     """Parse the House membership table out of the page source ``h``.

     Takes the element that follows the ``h2`` heading whose ``span`` has
     ``id="Voting_members_by_state"`` and builds one dict per row after the
     header row: team, state, picture (optional), name, profile, party,
     prior experience, college, assumed office (optional) and birth data.

     The state is the text of the first cell's link without its last word;
     a leftover ``At`` word or trailing `` at`` is removed as well. Rows
     with seven children take their party from the previously accepted
     representative.

     Rows that do not match the expected layout are skipped. An entry for
     the ``team:House of Representatives`` profile is always appended last.

     Returns the list of dicts.
     """
     representatives = []
     doc = html.document_fromstring(h)
     table = doc.cssselect(
         'h2 span[id="Voting_members_by_state"]')[0].getparent().getnext()
     for tr in table.cssselect('tr')[1:]:
         try:
             congress = {keys.entity_team: 'House of Representatives'}
             s = parse.csstext(tr[0].cssselect("a")[0]).split(" ")[:-1]
             # Previously ``s.remove(' at')`` ran first. A token produced by
             # split(" ") can never contain a space, so that call always
             # raised ValueError and the 'At' removal after it never ran.
             if 'At' in s:
                 s.remove('At')
             state = ' '.join(s)
             if state.endswith(' at'):
                 state = state[:-3]
             congress[keys.entity_state] = state
             try:
                 congress[keys.entity_pic] = (
                     'http:' + tr[1].cssselect("a img")[0].attrib['src'])
             except (IndexError, KeyError):
                 # No portrait in this row; the picture stays unset.
                 pass
             name_link = tr[1].cssselect('span.vcard a')[0]
             congress[keys.entity_name] = name_link.text
             congress[keys.entity_profile] = fixed.clean_url(
                 'http://en.wikipedia.org' + name_link.attrib['href'])
             if len(tr) == 9:
                 congress[keys.entity_party] = parse.csstext(tr[3])
             elif len(tr) == 7:
                 # Shorter row: reuse the party of the last accepted row.
                 congress[keys.entity_party] = (
                     representatives[-1][keys.entity_party])
             congress[keys.entity_prior_exp] = parse.csstext(tr[-5])
             congress[keys.entity_college] = parse.csstext(tr[-4])
             ao = tr[-3].text
             if ao is not None:
                 # Leading text may be absent; the field is then left out.
                 congress[keys.entity_assumed_office] = ao.replace(
                     '*', '').strip()
             congress[keys.entity_born] = parse.csstext(tr[-1]).strip()
             representatives.append(congress)
         except Exception:
             # Best effort: a row that cannot be parsed is skipped.
             continue
     representatives.append({
         keys.entity_twitter: 'USHouseHistory',
         keys.entity_profile: 'team:House of Representatives',
     })
     return representatives
Exemplo n.º 30
0
 def is_born(self, html, maybeperson, url):
     """Fill ``maybeperson`` with name, birth date and profile from ``html``.

     The name is read from the header of the biography infobox, falling back
     to the page's ``firstHeading`` element when that lookup fails.

     ``keys.entity_profile`` (the cleaned ``url``) is set only when a table
     header reading "born" or "date of birth" is found, so its presence
     tells the caller that such a header exists. ``keys.entity_dob`` is
     added when that header's row also contains a ``span`` with class
     ``bday``.

     ``maybeperson`` is updated in place.
     """
     try:
         maybeperson[keys.entity_name] = parse.csstext(
             html.cssselect(
                 'table[class="infobox biography vcard"] tr th span')[0])
     except:
         # No biography infobox header: use the page heading instead.
         maybeperson[keys.entity_name] = parse.csstext(
             html.cssselect('h1[id="firstHeading"][class="firstHeading"]')
             [0])
     for th in html.cssselect('th'):
         if parse.csstext(th).lower() in ['born', 'date of birth']:
             try:
                 # Look for the bday span in the header's parent element; a
                 # closing parenthesis in its text is removed.
                 maybeperson[keys.entity_dob] = parse.csstext(
                     th.getparent().cssselect(
                         'span[class="bday"]')[0]).replace(')', '')
             except:
                 # A birth header without a bday span still counts.
                 pass
             maybeperson[keys.entity_profile] = fixed.clean_url(url)