def calculate_concat(question_text, answer):
    """Score *answer* by counting relevant Google results for the combined
    question+answer query.

    Mutates ``answer`` in place (``total_results``, ``results``, ``score``)
    and returns a fixed-width summary line for display.
    """
    query_url = define_url(question_text, answer.get_text())
    google_results, full_page = search(query_url, full_page=True)
    # Extract the number of total google results
    answer.total_results = get_google_total_results(full_page)
    # Hoist the repeated answer.get_text() lookup out of the loop.
    answer_text = answer.get_text()
    for result in google_results:
        result_text = PyQuery(result).text()
        # If Google doesn't find enough results, it includes some that aren't
        # really relevant, adding "Missing words: <keywords>" ("Mancanti:" in
        # the Italian UI), where keywords are words included in the search
        # query (the answer, here). Such results are excluded:
        # - the answer must appear in the result text
        # - "Mancanti:" must not appear (so it's a relevant result)
        # - the answer must not be in the "Must include" (last) line
        if (answer_text in result_text.lower()
                and "Mancanti:" not in result_text
                and answer_text not in result_text.split("\n")[-1].lower()):
            # Yay! This is a relevant result!
            answer.results += 1
    # Calculate the score of the answer (a zero relevant-result count still
    # scores as total_results * 1 — original behavior preserved).
    answer.score = answer.total_results * (answer.results if answer.results > 0 else 1)
    return f"{answer.get_text()[:40]:^40}{answer.score:<10}{answer.results:^10}{answer.total_results:<10}"
def __init__(self, elem, trims, should_cleanup):
    """Build text and HTML views of *elem*, removing every string in *trims*.

    Sets ``text`` (trim-stripped), ``trimmed_text`` (whitespace-collapsed),
    ``html`` (optionally cleaned), and ``normalized_text`` (non-word chars
    removed, lowercased).
    """
    pq = PyQuery(elem)
    raw = pq.text()
    for fragment in trims or []:
        raw = raw.replace(fragment, '')
    # NOTE(review): self.rx looks unused within this method; kept so the
    # instance attribute remains available to other code.
    self.rx = re.compile(r'\W+')
    self.text = raw.strip()
    self.trimmed_text = non_trimmed.sub(' ', self.text)
    self.html = pq.html()
    if should_cleanup:
        self.html = self.cleanup_html()
    # Deliberately uses the un-stripped trimmed text, as in the original.
    self.normalized_text = nonword.sub('', raw.lower())
def process(url, handle):
    """Scrape the site-listing anchors at *url* and write the domain names,
    one per line, to the file-like *handle*.

    The "More" pagination link that ends the listing is skipped.
    """
    source = PyQuery(url=url)
    site_list = source.find("li.site-listing").find("a")
    # Collect lines and join once instead of repeated string concatenation
    # (the original += loop was quadratic in the worst case).
    lines = []
    for data in site_list:
        domain = PyQuery(data).text()
        if domain.lower() == "more":
            continue
        lines.append(domain + "\n")
    handle.write("".join(lines))
def fetch_events():
    """Scrape today's MLB game pages for the configured player/team and call
    handle_match/handle_miss for each newly seen at-bat event.

    Redis remembers at-bats already processed, keyed per game/player/at-bat,
    so each event is handled exactly once.
    """
    # some constants for this scrape
    playerID = CONFIG['playerID']
    team = CONFIG['team']
    # use Honolulu time zone so we scrape the right date
    # even during weird baseball
    today = arrow.now('Pacific/Honolulu').date()
    r = create_redis_connection()
    # scrape the MLB game list page
    game_day_url = DATA_ROOT + 'year_{0}/month_{1}/day_{2}/'.format(
        today.year, '{:02d}'.format(today.month), '{:02d}'.format(today.day))
    page = PQ(game_day_url)
    # find the links on page
    game_links = [PQ(link).attr('href') for link in page('li a')]
    # we only care about game data links for player's team
    game_links = [link.strip('/') for link in game_links
                  if 'gid' in link and team.lower() in link]
    # Hoist the tracked-events set out of the loops (the original rebuilt a
    # lowercased list per at-bat, and its comprehension variable shadowed the
    # at-bat loop variable "event").
    tracked = {name.lower() for name in CONFIG['events_tracked']}
    # iterate through team's games for the day
    for gameID in game_links:
        # get the player's batter data file for this game
        data_url = game_day_url + gameID
        data_url += "/batters/{0}.xml".format(playerID)
        page = PQ(data_url)
        # just the at-bat events please
        atbats = page('atbats ab')
        # iterate through player's at-bats
        for index, event in enumerate(atbats):
            atbat = index + 1
            # see if we've seen this at-bat
            rkey = "{0}-{1}-AB{2}".format(gameID, playerID, atbat)
            stored = r.get(rkey)
            # store results of new at-bats so we only
            # match against events we haven't seen
            if not stored:
                result = PQ(event).attr("event")
                match = result.lower() in tracked
                r.set(rkey, result)
                # if we match, do a thing
                if match:
                    handle_match(result)
                else:
                    handle_miss(result)
    # Fixed: "print 'Done...'" was Python 2 statement syntax — a SyntaxError
    # in Python 3, which this file targets (f-strings are used elsewhere).
    print('Done with scrape.')
def parse_html(filename, pixel='720', sub='', download='magnet'):
    """Read a saved HTML listing page and print the first matching download
    link per row.

    A row matches when its first anchor's title contains *pixel* and
    (case-insensitively) *sub*; of that row's anchors, the first href
    containing *download* is collected. Prints a summary count followed by
    the collected links.
    """
    with open(filename, encoding='utf-8') as fh:
        document = PQ(fh.read())
    rows = document.find('#seedlist').find('tr')
    matched_titles = []
    links = []
    for row in rows:
        row = PQ(row)
        anchors = row.find('a')
        title = PQ(anchors[0]).text()
        # Guard clause instead of nested if: skip rows whose title doesn't
        # carry both the resolution and the subtitle tag.
        if pixel not in title or sub.lower() not in title.lower():
            continue
        matched_titles.append(title)
        for anchor in anchors:
            href = anchor.attrib['href']
            if download in href:
                links.append(href)
                break
    print('total:', len(matched_titles), 'find:', len(links))
    print('\n'.join(links))