def get_table_data(self):
    self.data = []
    etree = self.tree.xpath(
        '//div[@data-type="league-table-container"]'
        '/div[@data-type="table"]/div[@data-type="table-row"]')
    for element in etree:
        # One standings row: team name, rank and the usual table stats.
        name = sc.get_text(element.xpath('./div[@class="team"]/text()'))
        place = sc.get_text(element.xpath('.//span[@data-type="rank"]/text()'))
        played = sc.get_text(element.xpath('.//div[@data-type="played"]/text()'))
        wins = sc.get_text(element.xpath('.//div[@data-type="wins"]/text()'))
        draws = sc.get_text(element.xpath('./div[@data-type="draws"]/text()'))
        losses = sc.get_text(element.xpath('./div[@data-type="losses"]/text()'))
        scored = sc.get_text(element.xpath('./div[@data-type="goals-scored"]/text()'))
        received = sc.get_text(element.xpath('./div[@data-type="goals-received"]/text()'))
        difference = sc.get_text(element.xpath('./div[@data-type="goal-difference"]/text()'))
        points = sc.get_text(element.xpath('./div[@data-type="points"]/text()'))
        print(name, place, played, wins, draws, losses, scored, received,
              difference, points)
        self.data.append([name, place, played, wins, draws, losses, scored,
                          received, difference, points])
    sc.Database(('name', 'place', 'played', 'wins', 'draws', 'losses',
                 'scored', 'received', 'difference', 'points'),
                'table.db').database(self.data)
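# The sc.get_text helper used throughout these scrapers is not shown. A
# minimal sketch of what it might look like, assuming it takes the list
# returned by an xpath('.../text()') call and yields a single cleaned
# string; the real helper in the sc module may differ:
def get_text(nodes):
    """Return the first non-empty, stripped string from an xpath result list."""
    for node in nodes:
        text = str(node).strip()
        if text:
            return text
    return ''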
def collect_info(product_url):
    product_html = scraper.content(product_url)
    # .pop() removes and returns the last matched text node for each selector.
    info = {
        'name': scraper.get_text(product_html, 'div#variant-info h1').pop(),
        'sku': scraper.get_text(product_html, 'div.commerce-product-sku span').pop(),
        'price': scraper.get_text(product_html, 'p.field-type-commerce-price').pop()
    }
    return info
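# Hypothetical usage of collect_info; the URL below is a placeholder, not a
# real product page:
if __name__ == '__main__':
    info = collect_info('https://example.com/product/123')
    print(info['name'], info['sku'], info['price'])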
def get_data(self):
    # The site paginates its listings; pages 1 through 16 are fetched in turn.
    for number in range(1, 17):
        self.source = requests.get(self.url + str(number) + '/')
        self.tree = lxml.html.fromstring(self.source.content)
        self.etree = self.tree.xpath(
            '//div[@id="departement-content"]/a[@class]')
        for element in self.etree:
            self.location = sc.get_text(
                element.xpath('./span[@class="ville"]/text()'))
            self.name = sc.get_text(
                element.xpath('./span[@class="nom"]/text()'))
            self.description = sc.get_text(
                element.xpath('./span[@class="intitule"]/text()'))
            self.data.append([self.location, self.name, self.description])
    sc.Database(('location', 'name', 'description')).push_data(self.data)
def get_data(self):
    for number in range(1, 247):
        print(number)
        self.url = self.get_url(number)
        self.source = requests.get(self.url)
        self.etree = lxml.html.fromstring(self.source.content)
        # Keep only <tr> rows without an id attribute (the data rows).
        self.tree = self.etree.xpath(
            '//div[@id="listingsResults"]//tr[not(@id)]')
        for element in self.tree:
            self.job = sc.get_href(
                element.xpath('.//div[@class="listing-title"]/a[@href]'))
            self.location = sc.get_text(
                element.xpath('.//div[@class="left-side"]/span[1]'
                              '/following-sibling::span[1]/text()'))
            self.posted = sc.get_text(
                element.xpath('.//div[@class="left-side"]/span[2]'
                              '/following-sibling::span[2]/text()'))
            self.link = sc.get_href(
                element.xpath('.//div[@class="left-side"]/span[3]'
                              '/following-sibling::span[3]/a[@href]'))
            self.company = sc.get_text(
                element.xpath('.//div[@class="left-side"]/span[3]'
                              '/following-sibling::span[3]//text()'))
            self.data.append([self.job, self.location, self.posted,
                              self.link, self.company])
    sc.Database(('job', 'location', 'posted', 'link',
                 'company')).push_data(self.data)
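# sc.get_href is likewise not shown. A plausible sketch, assuming it takes a
# list of <a> elements from xpath('...a[@href]') and returns the first href;
# the real helper may resolve relative URLs or behave differently:
def get_href(elements):
    """Return the href attribute of the first matched link, or ''."""
    for element in elements:
        href = element.get('href')
        if href:
            return href
    return ''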
def get_matches_data(self):
    self.data = []
    count = int(self.tree.xpath(
        'count(//div[@data-type="container"]/div[@data-type="evt"])'))
    # Match rows are addressed by position so each row's preceding date
    # header can be looked up relative to it.
    for i in range(1, count + 1):
        element = self.tree.xpath(
            '//div[@data-type="container"]//div[@data-type="evt"]['
            + str(i) + ']')[0]
        date = sc.get_text(element.xpath(
            './/preceding::div[@class="right fs11"][1]/text()'))
        time = sc.get_text(element.xpath('.//div[@class="min "]/span/text()'))
        home_name = sc.get_text(
            element.xpath('.//div[@class="ply tright name"]/span/text()'))
        away_name = sc.get_text(
            element.xpath('.//div[@class="ply name"]/span/text()'))
        score_home = sc.get_text(element.xpath('.//span[@class="hom"]/text()'))
        score_away = sc.get_text(element.xpath('.//span[@class="awy"]/text()'))
        print(date, time, home_name, score_home, score_away, away_name)
        self.data.append(
            [date, time, home_name, score_home, away_name, score_away])
    sc.Database(('date', 'time', 'home_name', 'score_home', 'away_name',
                 'score_away'), 'matches.db').database(self.data)
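# The sc.Database helper appears with two entry points across these scrapers
# (.database(...) and .push_data(...)). A minimal sqlite3-backed sketch under
# that assumption; the table name, column handling and alias are guesses, not
# the actual sc implementation:
import sqlite3

class Database:
    def __init__(self, columns, file_name='data.db'):
        self.columns = columns
        self.file_name = file_name

    def push_data(self, rows):
        """Create the table if needed and bulk-insert the scraped rows."""
        connection = sqlite3.connect(self.file_name)
        cursor = connection.cursor()
        cursor.execute('CREATE TABLE IF NOT EXISTS data ({})'.format(
            ', '.join(self.columns)))
        placeholders = ', '.join('?' * len(self.columns))
        cursor.executemany(
            'INSERT INTO data VALUES ({})'.format(placeholders), rows)
        connection.commit()
        connection.close()

    # Some call sites use .database(...); treat it as an alias here.
    database = push_data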
def write(Bing_Results, Google_Results, Queries, n):
    results = open('results2.txt', 'a')
    results.write('Query Kendall Jaccard B_Sub B_Pol B_pos B_neg B_Class '
                  'G_Sub G_Pol G_pos G_neg G_Class\n')
    kd, jc = 0, 0
    bs, bp, bpos, bneg, bcl = 0, 0, 0, 0, 0
    gs, gp, gpos, gneg, gcl = 0, 0, 0, 0, 0
    # Missed: 13, 14, 15, 16, 17, 18, 19
    for query in Queries[42:50]:
        query = query[:-1]  # drop the trailing newline
        curr_kd = kendall.Kendall(Bing_Results[query], Google_Results[query])
        curr_jc = jaccard.Jaccard(Bing_Results[query], Google_Results[query])
        curr_bs = subjectivity.Subjectivity(
            [scraper.get_text(x) for x in Bing_Results[query]])
        curr_gs = subjectivity.Subjectivity(
            [scraper.get_text(x) for x in Google_Results[query]])
        print(query)
        kd += curr_kd
        jc += curr_jc
        bs += curr_bs[0]
        bp += curr_bs[1]
        bpos += curr_bs[2]
        bneg += curr_bs[3]
        bcl += 1 if curr_bs[4] == 'pos' else 0
        gs += curr_gs[0]
        gp += curr_gs[1]
        gpos += curr_gs[2]
        gneg += curr_gs[3]
        gcl += 1 if curr_gs[4] == 'pos' else 0
        msg = (query + ' ' + str(round(curr_kd, 4)) + ' '
               + str(round(curr_jc, 4)) + ' '
               + str(round(curr_bs[0], 4)) + ' ' + str(round(curr_bs[1], 4))
               + ' ' + str(round(curr_bs[2], 4)) + ' '
               + str(round(curr_bs[3], 4)) + ' ' + str(curr_bs[4]) + ' '
               + str(round(curr_gs[0], 4)) + ' ' + str(round(curr_gs[1], 4))
               + ' ' + str(round(curr_gs[2], 4)) + ' '
               + str(round(curr_gs[3], 4)) + ' ' + str(curr_gs[4]) + '\n')
        results.write(msg)
    msg = (f'Average {round(kd/n, 4)} {round(jc/n, 4)} {round(bs/n, 4)} '
           f'{round(bp/n, 4)} {round(bpos/n, 4)} {round(bneg/n, 4)}'
           f' {round(bcl/n, 4)} {round(gs/n, 4)} {round(gp/n, 4)} '
           f'{round(gpos/n, 4)} {round(gneg/n, 4)} {round(gcl/n, 4)}')
    results.write(msg)
    results.close()  # flush the appended results to disk
def get_data(self, words):
    words = words.split('\n')
    for word in words:
        self.data[word] = []
        try:
            self.source = requests.get(self.url + word)
        except requests.exceptions.ConnectionError:
            FindWords.create_connection_error_label()
            continue  # no response to parse; move on to the next word
        self.tree = lxml.html.fromstring(self.source.content)
        for option in self.options:
            self.etree = self.tree.xpath(option)
            for element in self.etree:
                self.translation = sc.get_text(element.xpath('./text()'))
                self.data[word].append(self.translation)
def is_url_relevant(url):
    """Check if a URL is relevant.

    This function checks whether a URL's page text is classified as
    relevant and whether it contains a relevant keyword.
    """
    driver = create_driver()
    driver.get(url)
    print("Checking:", url)
    urltext = scraper.get_text(driver.page_source)
    driver.quit()  # release the browser once the page text is captured
    value = classifier.predict([urltext])
    keyword_relevance = isKeywordPresent(urltext)
    relevant = value and keyword_relevance
    print("Relevant:", relevant)
    return relevant
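# isKeywordPresent is defined elsewhere. A hedged sketch of its likely shape,
# assuming keywordFetcher.fetchKeyTerms() returns a list of lowercase terms;
# the actual check may be stricter (e.g. whole-word matching):
def isKeywordPresent(urltext):
    """Return True if any known key term occurs in the page text."""
    keywords = keywordFetcher.fetchKeyTerms()
    text = urltext.lower()
    return any(keyword in text for keyword in keywords)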
def get_data(self):
    # Select every table row that has a following sibling row, i.e. all
    # rows except the last.
    self.etree = self.tree.xpath(
        '//table[@class="table"]//tr[following-sibling::tr]')
    self.name = sc.get_text(self.tree.xpath(
        '//div[@class="row bottom-margin-1x"][1]/div[1]/h1/text()[4]'))
    for element in self.etree:
        self.date = sc.get_text(element.xpath('./td[1]/text()'))
        self.open = sc.get_text(element.xpath('./td[2]/text()'))
        self.high = sc.get_text(element.xpath('./td[3]/text()'))
        self.low = sc.get_text(element.xpath('./td[4]/text()'))
        self.close = sc.get_text(element.xpath('./td[5]/text()'))
        self.volume = sc.get_text(element.xpath('./td[6]/text()'))
        self.market_cap = sc.get_text(element.xpath('./td[7]/text()'))
        self.list = [self.date, self.open, self.high, self.low, self.close,
                     self.volume, self.market_cap]
        self.data.append(self.list)
    sc.Database(
        ('date', 'open', 'high', 'low', 'close', 'volume', 'market_cap'),
        file_name=self.name + '.db').push_data(self.data)
def calc_key_count(url):
    """Count key terms in a URL's page.

    This function counts the number of key terms on a URL's web page by
    analyzing its content.
    """
    lem = WordNetLemmatizer()
    driver = create_driver()
    driver.get(url)
    keywords = keywordFetcher.fetchKeyTerms()
    urltext = scraper.get_text(driver.page_source)
    driver.quit()  # release the browser once the page text is captured
    # Lemmatize and lowercase every token so counts match the key terms.
    urltext = urltext.split()
    urltext = [lem.lemmatize(plural).lower() for plural in urltext]
    key_count = 0
    for keyword in keywords:
        key_count += urltext.count(keyword)
    print(key_count)
    return key_count
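# Hypothetical invocation of the two checks above; the URL is a placeholder:
url = 'https://example.com/article'
if is_url_relevant(url):
    print(calc_key_count(url))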