def add_website(self, website_meta):
    """Add the website in the database.

    Returns the already-stored document when a website with the same
    name and homepage URL exists; otherwise saves a new one.

    Parameters:
        website_meta: mapping with "name" and "homepage_url" keys.

    Returns:
        The Website document, or None when validation/save failed.
    """
    name = website_meta.get("name")
    homepage_url = website_meta.get("homepage_url")

    # Create a website object to check if it exists in the database.
    web = Website.objects(name=name, homepage_url=homepage_url).first()
    if web:
        return web

    # This website object is used to add to the database.
    web = Website(name=name, homepage_url=homepage_url)
    # BUG FIX: `status` was previously unbound when save() raised
    # ValidationError, causing an UnboundLocalError at `if status:`.
    status = None
    try:
        status = web.save()
    except ValidationError:
        self.logger.warn('Save/Validate Website Failed! url: {0}'
                         .format(homepage_url))
    return web if status else None
def main():
    """Diagnose the single website whose URL is given on the command line."""
    if len(sys.argv) != 2:
        print("Invalid command line arguments.")
        print("Usage: python3 diagnose.py <WEBSITE_URL>")
        exit()
    url = sys.argv[1]

    FILEPATH_PREFIX = "data/"
    FILEPATH_TEXT_SUFFIX_CLEAN = "_clean.txt"
    FILEPATH_TEXT_SUFFIX_BLOCK = "_block.txt"
    FILEPATH_IMAGE_SUFFIX_CLEAN = "_clean.png"
    FILEPATH_IMAGE_SUFFIX_BLOCK = "_block.png"

    txt_clean = FILEPATH_PREFIX + url + FILEPATH_TEXT_SUFFIX_CLEAN
    # BUG FIX: the block *text* file previously used the image (.png)
    # suffix; FILEPATH_TEXT_SUFFIX_BLOCK was defined but never used.
    txt_block = FILEPATH_PREFIX + url + FILEPATH_TEXT_SUFFIX_BLOCK
    img_clean = FILEPATH_PREFIX + url + FILEPATH_IMAGE_SUFFIX_CLEAN
    img_block = FILEPATH_PREFIX + url + FILEPATH_IMAGE_SUFFIX_BLOCK

    website_clean = Website(txt_clean, img_clean, "clean")
    website_block = Website(txt_block, img_block, "block")
    pair = WebsitePair(website_clean, website_block)

    # @TODO This if-else section can be expanded as future faults become detectable.
    if page_is_blank(website_clean, website_block, pair):
        print("Page is blank!")
    else:
        print("No faults detected.")
def startup(self):
    """
    Some stuff that should get called after everything is loaded.

    Boots the SeisHub connection and network tree, connects the station
    selection to the waveform scene, starts the embedded website, loads
    the map HTML into the web view, and points the station browser at the
    SeisHub station-management page using stored credentials.
    """
    self.env.seishub.startup()
    self.nw_tree.startup()
    # Connect some slots: forward selection changes in the network tree
    # to the waveform scene so newly selected channels are added.
    QtCore.QObject.connect(self.nw_tree.nw_select_model,
        QtCore.SIGNAL("selectionChanged(QItemSelection, QItemSelection)"), \
        self.waveforms.waveform_scene.add_channel)
    web = Website(env=self.env)
    web.startup()
    # Add a WebView to later display the map.
    file = open(os.path.join(self.env.temp_res_dir, 'map.html'))
    html = file.read()
    file.close()
    self.env.web.setHtml(html)
    self.picks.update()
    css_url = QtCore.QUrl.fromLocalFile(os.path.abspath(self.env.css))
    server = '%s/manage/seismology/stations' % self.env.seishub_server
    url = QtCore.QUrl(server)
    url.setUserName(self.env.seishub_user)
    url.setPassword(self.env.seishub_password)
    # Might work with some Qt version...
    # NOTE(review): the user stylesheet is applied both before and after
    # load() — presumably a workaround for Qt versions that ignore one of
    # the two calls; confirm before removing either.
    self.env.station_browser.page().settings().setUserStyleSheetUrl(css_url)
    self.env.station_browser.load(url)
    self.env.station_browser.page().settings().setUserStyleSheetUrl(css_url)
def setUp(self):
    """Create the plan, website, and customer fixtures shared by tests."""
    self.single_plan = Plan('Single', 49, 1)
    self.plus_plan = Plan('Plus', 99, 3)
    # Two distinct Website objects that happen to share the same URL.
    self.website_1, self.website_2 = (
        Website('https://google.com'),
        Website('https://google.com'),
    )
    self.customer_1 = Customer('customer_1', '123456789', '*****@*****.**')
def __init__(self, name, url, internalLinkPattern, targetPattern,
             titleSelector, priceSelector):
    """A Website that additionally tracks products and a price selector."""
    Website.__init__(self, name, url, internalLinkPattern, targetPattern,
                     titleSelector)
    # Price-scraping state layered on top of the base Website.
    self.priceSelector = priceSelector
    self.products = []
    self.termsToExclude = []
def __init__(self, user):
    """Initialise the 'JD' site wrapper and all of its page helpers."""
    Website.__init__(self, 'JD', user)
    # Every page helper receives this site instance; creation order is
    # preserved from the original implementation.
    page_specs = (
        ('login_page', LoginPage),
        ('activ_page', ActiPage),
        ('list_page', ListPage),
        ('main_page', MainPage),
        ('coupon_page', CouponPage),
        ('data_page', DataPage),
    )
    for attr_name, page_cls in page_specs:
        setattr(self, attr_name, page_cls(self))
def __init__(self, user):
    """Initialise the 'JD_mobile' site wrapper and its page helpers."""
    Website.__init__(self, 'JD_mobile', user)
    # Every page helper receives this site instance; creation order is
    # preserved from the original implementation.
    page_specs = (
        ('login_page', LoginPage),
        ('main_page', MainPage),
        ('data_page', DataPage),
        ('charge_page', ChargePage),
        ('get_coupon_page', GetCouponPage),
        ('json_page', JsonPage),
    )
    for attr_name, page_cls in page_specs:
        setattr(self, attr_name, page_cls(self))
def serializeWebsite(self, website):
    """Build a Website domain object from a raw website dict.

    The optional "sitemap" key is forwarded only when present.

    Parameters:
        website: dict with "homepage", "input_dict", "lastmod" and an
            optional "sitemap" key.

    Returns:
        A Website constructed from the dict's fields.
    """
    # `in website` is the idiomatic membership test (was `in website.keys()`).
    if "sitemap" in website:
        return Website(website['homepage'], website['input_dict'],
                       website['lastmod'], sitemap=website['sitemap'])
    return Website(website['homepage'], website['input_dict'],
                   website['lastmod'])
def home():
    """Render the home page; on POST, register the submitted URL and
    re-check every known website."""
    if request.method == 'POST':
        Website(request.form['url'])
        Website.check_all()
    tracked = Website.all
    return render_template(
        "home.html",
        pages=tracked,
        length=len(tracked)
    )
def test_fix_link(link, hostname, scheme, result):
    """fix_link should resolve *link* against the parsed seed URL parts."""
    parsed = Mock()
    parsed.hostname = hostname
    parsed.scheme = scheme
    parsed.netloc = hostname

    site = Website('seed_url')
    assert site.fix_link(link, parsed) == (result, hostname)
def test_scrape(monkeypatch, page_content, links, to_visit):
    """A full scrape starting from the seed URL discovers five pages."""
    response = Mock()
    response.text = page_content
    response.status_code = 200
    monkeypatch.setattr('website.requests.get', lambda x: response)

    site = Website('http://hostname/url')
    site.scrape()

    # pages are 'http://hostname/url', 'http://hostname/new-url',
    # 'https://hostname/', 'http://hostname/', 'https://hostname/new-url'
    assert len(site.pages) == 5
def test_scrape_url(monkeypatch, page_content, links, to_visit):
    """scrape_url should record the page's links and queue new URLs."""
    response = Mock()
    response.text = page_content
    response.status_code = 200
    monkeypatch.setattr('website.requests.get', lambda x: response)

    site = Website('http://hostname/url')
    # Simulate visiting the page.
    url, _ = site.to_visit.popitem()
    site.scrape_url(url)

    assert site.to_visit == OrderedDict((key, None) for key in to_visit)
    assert site.pages[url].links == links
def __init__(self, url, keywords=None, searchPageLimit=2,
             websitesJsonFile="websites.json", isInitialCrawl=False):
    """Crawl *url* for article links.

    Parameters:
        url: base URL of the news site to crawl.
        keywords: optional keywords used during the crawl.
        searchPageLimit: maximum number of search-result pages to walk.
        websitesJsonFile: JSON file describing known websites.
        isInitialCrawl: flag forwarded to per-site crawl behaviour.

    Raises:
        WebsiteFailedToInitialize: when the site cannot be reached.
    """
    # initialize class attributes
    self.baseUrl = url
    self.keywords = keywords
    self.articleLinks = []
    self.articleCount = 0
    self.searchPageLimit = searchPageLimit
    self.websitesJsonFile = websitesJsonFile
    self.isInitialCrawl = isInitialCrawl

    # instantiate a Website object to interact with the website to be crawled
    try:
        self.website = Website(url, websitesJsonFile=self.websitesJsonFile)
    # raise exception if there is an error connecting to the website
    except WebsiteFailedToInitialize:
        raise WebsiteFailedToInitialize(url)

    # Open the json file containing websites and their attributes.
    # FIX: the redundant explicit close() inside the with-block was
    # removed — the context manager already closes the file.
    with open(self.websitesJsonFile) as data_file:
        self.websites = json.load(data_file)

    # set the searchQuery attribute to the appropriate search query
    # structure in the websites json file
    for website, attributes in self.websites.items():
        if website in self.baseUrl:
            self.searchQuery = attributes["searchQuery"]
            self.nextPageType = attributes["nextPage"]

    # websites whose article urls need to be manually crawled
    self.exceptions = [
        "https://www.ourmidland.com/",
        "https://www.lakecountystar.com/",
        "https://www.northernexpress.com/",
        "https://www.manisteenews.com/"
    ]

    print("\r" + bcolors.OKGREEN + "[+]" + bcolors.ENDC + " Crawling " +
          self.baseUrl + "..." + bcolors.ENDC, end="")
    sys.stdout.flush()
    # start crawling
    self.crawl()
    print("\r" + bcolors.OKGREEN + "[+]" + bcolors.ENDC + " Crawled " +
          self.baseUrl + ": " + bcolors.OKGREEN +
          str(len(self.articleLinks)) + " URLs retrieved" + bcolors.ENDC)
def test_find_links(page_content, hostname, scheme, links, to_visit):
    """find_links should collect page links and extend the visit queue."""
    parsed = Mock()
    parsed.hostname = hostname
    parsed.scheme = scheme
    parsed.netloc = hostname

    site = Website('http://hostname/url')
    # Simulate visiting the page.
    site.to_visit.popitem()

    page = Page('a_url')
    soup = BeautifulSoup(page_content, 'html.parser')
    site.find_links(page, soup, parsed)

    assert page.links == links
    assert site.to_visit == OrderedDict((key, None) for key in to_visit)
def process_single_website(website_url):
    """Processes a single website and exports to csv string.

    Builds the clean/block text and image paths for *website_url*,
    constructs the Website pair, and prints the CSV header.
    """
    txt_clean = FILEPATH_PREFIX + website_url + FILEPATH_TEXT_SUFFIX_CLEAN
    # BUG FIX: the block *text* path previously used the image suffix
    # (FILEPATH_IMAGE_SUFFIX_BLOCK), producing a .png path for text data.
    txt_block = FILEPATH_PREFIX + website_url + FILEPATH_TEXT_SUFFIX_BLOCK
    img_clean = FILEPATH_PREFIX + website_url + FILEPATH_IMAGE_SUFFIX_CLEAN
    img_block = FILEPATH_PREFIX + website_url + FILEPATH_IMAGE_SUFFIX_BLOCK

    website_clean = Website(txt_clean, img_clean, "clean")
    website_block = Website(txt_block, img_block, "block")
    pair = WebsitePair(website_clean, website_block)
    print(get_csv_header(website_clean, website_block, pair))
def run_website():
    """Start a tiny demo site with an index route and a user lookup route."""
    site = Website()

    @site.route('/')
    def index():
        return 200, 'users list'

    @site.route('/users/([0-9]+)')
    def user(user_id):
        # Only users 1 and 2 exist in this demo.
        if user_id in ('1', '2'):
            return 200, f'user {user_id}'
        return 404, ''

    site.run(_ADDRESS)
def __init__(self):
    """Build the Tk window, font, and widgets, then enter the main loop."""
    self.w = Website()
    self.root = Tk()
    self.root.title("Auto site - Enter the fields")
    self.my_font = tkFont.Font(family="Helvetica", size=11)
    container = Frame(self.root, height=800, width=800, padx=50, pady=10)
    container.pack()
    self.frame = container
    # Populate the window, then block inside Tk's event loop.
    self.fields()
    self.buttons()
    self.root.mainloop()
def main():
    """Demo the subscription flow: plan limits, website quotas, and
    upgrading a customer's plan."""
    # Initialize different plans
    single_plan = Plan('Single', 49, 1)
    plus_plan = Plan('Plus', 99, 3)
    infinite_plan = Plan('Infinite', 249, -1)

    # Initialize multiple websites
    website_1 = Website('https://website_1.com')
    website_2 = Website('https://website_2.com')
    website_3 = Website('https://website_3.com')
    website_4 = Website('https://website_4.com')

    # Initialize multiple customers
    customer_1 = Customer('customer_1', '123456789', '*****@*****.**')
    customer_2 = Customer('customer_2', '123456789', '*****@*****.**')
    customer_3 = Customer('customer_3', '123456789', '*****@*****.**')

    # customer_1 subscribed for single_plan
    customer_1.add_subscription(single_plan)
    print("{} has subscribed for {} plan".format(
        customer_1, customer_1.subscription.plan))

    # customer_1 added one website
    customer_1.add_website(website_1)
    print("{} has added website {} as per the {} plan".format(
        customer_1, customer_1.websites, customer_1.subscription.plan))

    # customer_1 can not add more website in single_plan
    customer_1.add_website(website_2)
    print("{} can't add website {} as per the {} plan".format(
        customer_1, website_2, customer_1.subscription.plan))

    # customer_1 can change plan from single_plan to plus_plan
    customer_1.change_plan(plus_plan)
    print("{} has changed his current plan {} to {} plan".format(
        customer_1, single_plan, customer_1.subscription.plan))

    # customer_2 subscribe for infinite_plan
    customer_2.add_subscription(infinite_plan)

    # customer_2 can add multiple websites
    for site in (website_1, website_2, website_3, website_4):
        customer_2.add_website(site)
    print("{} has added four websites {} under infinite plan".format(
        customer_2, customer_2.websites))
def go(self):
    """Crawl the configured site, then persist a Website summary row."""
    self.work_pages(self.site)
    record = Website(url=self.site,
                     title='',
                     domain=self.site,
                     pages_count=self.pages_count,
                     HTML_version=0.0)
    self.session.add(record)
    self.session.commit()
def __init__(self, scope: core.Construct, id: str, **kwargs) -> None:
    """Provision the stock-history stack: DynamoDB table with a GSI,
    the data importer, the REST API, and the website that consumes it."""
    super().__init__(scope, id, **kwargs)

    ticker_attr = dynamodb.Attribute(
        name='Ticker',
        type=dynamodb.AttributeType.STRING,
    )
    date_attr = dynamodb.Attribute(
        name='Date',
        type=dynamodb.AttributeType.STRING,
    )

    # Table keyed (Ticker, Date); on-demand billing, point-in-time
    # recovery enabled, destroyed together with the stack.
    history_table = dynamodb.Table(
        self,
        'StockHistory',
        partition_key=ticker_attr,
        sort_key=date_attr,
        billing_mode=dynamodb.BillingMode.PAY_PER_REQUEST,
        removal_policy=core.RemovalPolicy.DESTROY,
        point_in_time_recovery=True,
    )

    # Inverted index (Date, Ticker) projecting only the 'Name' attribute.
    index_name = 'Date-index'
    history_table.add_global_secondary_index(
        index_name=index_name,
        partition_key=date_attr,
        sort_key=ticker_attr,
        projection_type=dynamodb.ProjectionType.INCLUDE,
        non_key_attributes=['Name'])

    Importer(self, 'Importer', table=history_table)
    rest = RestApi(self, 'Api', table=history_table, index_name=index_name)
    Website(self, 'Website', api=rest.api)
def generate_relation_dict(self, news_sources, news_targets): ''' generates a dictionary of string/list(int) in the format {source : target_count} ie. {s1 : [tc1, tc2, ... tcn], s2 : [tc1, tc2, ... tcn], ... sn : [tc1, tc2, ... tcn]} where sn is the source, tcn is the citation count of each target ''' # initialize the relation dictionary. relation_dict = {} for source_name, source_url in news_sources.iteritems(): # create an empty list with a specific size which describe the number # of target referenced by each source target_count = [0] * len(news_targets) # Find the articles which have a specific source website url articles = Article.objects( Q(website=Website.objects(homepage_url=source_url).only('homepage_url').first()) & Q(citations__exists=True)).only('citations') for article in articles: # Count the times that each target in the news_targets is in the # citation list for each article and put it in the target_count for citation in article.citations: if not isinstance( citation, int ): i = 0 while i < len(news_targets): if citation.target_name.lower() == news_targets.keys()[i].lower(): target_count[i] += 1 i += 1 relation_dict[source_name] = target_count return relation_dict
def main():
    """Entry point: parse CLI args, scan the listed printers, and write
    an HTML report (Python 2 — note the `except IOError, e` syntax).
    Exits 0 on success, 1 on I/O errors."""
    logger.info("Cartriage v5.0")
    parser = argparse.ArgumentParser(
        description="Retrieves information from printers.")
    # type=open means argparse itself opens the printers file.
    parser.add_argument(
        "l",
        type=open,
        metavar="printers",
        help="Text file containing printer IP addresses, one for each line.")
    parser.add_argument("o", metavar="output",
                        help="Filename for resulting HTML page.")
    parser.add_argument("-v", action="store_true", help="Enable verbose mode.")
    try:
        args = parser.parse_args()
        if args.v:
            logger.info("Enabled verbose mode")
            logger.setLevel(logging.DEBUG)
            logger.debug(args)
        startTime = time.time()
        # NOTE(review): time.clock()'s return value is discarded —
        # presumably a CPU-timer warm-up leftover; confirm it can go.
        time.clock()
        scanned, successfullyScanned, printers = runScan(args.l)
        elapsedTime = "%d seconds" % (time.time() - startTime)
        # Website renders the whole report; str(site) is the HTML page.
        site = Website(scanned, successfullyScanned, printers, elapsedTime)
        with open(args.o, "w") as output:
            output.write(str(site))
        logger.info("Done! Results available in file: %s" % args.o)
        sys.exit(0)
    except IOError, e:
        logger.error(str(e))
        sys.exit(1)
def get_website(self, url: str, check_interval: int):
    """
    Instantiates Website instance. Safely returns instance or None
    depending on success.

    PARAMETERS:
    check_interval: Positive integer in seconds. Ping refresh freuency
    e.g. 30 would equate to check every 30 seconds
    url: String e.g. http://google.fr

    RETURNS:
    Website instance or None.
    """
    try:
        return Website(url=url, check_interval=check_interval)
    except Exception:
        # Best-effort: any construction failure is reported to the user
        # and swallowed so the caller can re-prompt.
        print(
            "I wasn't able to connect with that URL.\n"
            + "Please revise it, including 'http://'"
            + " or 'https://' as appropriate)."
        )
        return None
def __init__(self, url, params, headers, cnx, cursor):
    """Keep the request settings and DB handles, then build the page helper."""
    self.url, self.params, self.headers = url, params, headers
    self.cnx, self.cursor = cnx, cursor
    # Website expects (url, params, headers, cursor, cnx) — cursor before cnx.
    self.website = Website(url, params, headers, cursor, cnx)
def get_articles(self, number=None):
    """Render up to *number* articles from the current user's news
    sources and targets.

    Parameters:
        number: optional cap on the number of articles rendered; falsy
            means "all".

    Returns:
        The rendered get_articles.html template.
    """
    global username
    show_article_template = Template(filename='get_articles.html')
    # FIX: fetch the user document once instead of issuing two identical
    # User.objects(...) queries for sources and targets.
    user = User.objects(name=username).first()
    articles = []
    for source_name in user.news_sources:
        articles += Article.objects(
            website=Website.objects(name=source_name).first()
        ).only('title', 'url').all()
    for target_name in user.news_targets:
        articles += Article.objects(
            website=Website.objects(name=target_name).first()
        ).only('title', 'url').all()
    if not number:
        number = len(articles)
    return show_article_template.render(articles=articles[:int(number)])
def test_city():
    """Fetch the Wikipedia page for every configured city.

    FIX: the fetched HTML was previously stored in an unused local
    (`html`); the call is kept for its fetch/caching side effect.  A
    large commented-out experiment block and the dangling bare `return`
    were removed.
    """
    cities = []
    for city in URL_CITY_ARRAY:
        location = Website(city)
        location.set_directory('./wikipedia/')
        # Fetch for its side effect; the HTML itself is not used here.
        location.get_html()
        cities.append(location)
def getWebsites(self):
    """Load data1/websites.json and map each website id to a Website."""
    with open('data1/websites.json') as data_file:
        entries = json.load(data_file)['websites']
    # One Website object per JSON entry, keyed by its id.
    return {entry['id']: Website(entry) for entry in entries}
def process_manifest():
    """Processes all websites in the manifest.

    Prints the CSV header before the first entry, then one CSV row per
    manifest entry.
    """
    # enumerate() replaces the manual range(len(...)) indexing.
    for i, entry in enumerate(manifest.MANIFEST):
        txt_clean = FILEPATH_PREFIX + entry[0] + FILEPATH_TEXT_SUFFIX_CLEAN
        # BUG FIX: the block *text* path previously used the image
        # suffix (FILEPATH_IMAGE_SUFFIX_BLOCK), yielding a .png path.
        txt_block = FILEPATH_PREFIX + entry[0] + FILEPATH_TEXT_SUFFIX_BLOCK
        img_clean = FILEPATH_PREFIX + entry[0] + FILEPATH_IMAGE_SUFFIX_CLEAN
        img_block = FILEPATH_PREFIX + entry[0] + FILEPATH_IMAGE_SUFFIX_BLOCK

        website_clean = Website(txt_clean, img_clean, "clean")
        website_block = Website(txt_block, img_block, "block")
        pair = WebsitePair(website_clean, website_block)
        if i == 0:
            print(get_csv_header(website_clean, website_block, pair))
        print(get_csv_string(website_clean, website_block, pair))
def generate_text(self, sites_file, search_limit, keep_to_sites):
    """Crawl every site listed in *sites_file* and accumulate page text."""
    with open(sites_file, "r") as handle:
        for site in handle.readlines():
            sys.stderr.write("Working on: " + site + '\n')
            self.sites.append(
                Website(home_page=site,
                        search_limit=search_limit,
                        keep_to_site=keep_to_sites))
    # Concatenate the text of every link discovered on every site.
    for site in self.sites:
        for link in site.links:
            self.text += site.get_page_text(link)
def read_file(filename):
    """
    Reads a file and returns a list of Website objects.

    Each line has the form "<url> <interval>"; the interval is parsed
    as an int.
    """
    websites = []
    with open(filename) as f:
        for line in f:
            # FIX: split() tolerates trailing newlines and repeated
            # whitespace, where split(' ') failed on extra spaces. Also
            # avoids shadowing the builtin name `file`.
            url, interval = line.split()
            websites.append(Website(url, int(interval)))
    return websites
def start():
    """Fetch list of web pages asynchronously."""
    websites = Website.all()
    start_time = default_timer()

    # Run the fetch batch on a fresh event loop until completion.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(asyncio.ensure_future(fetch_all(websites)))

    tot_elapsed = default_timer() - start_time
    helper.p('Total time: {0:5.2f}'.format(tot_elapsed))
class TopBetEu:
    """Scraper for topbet.eu money lines; pages are fetched via Website."""

    def __init__(self, url, headers, params, cnx, cursor):
        # Note: this signature takes (url, headers, params, ...) but
        # Website is constructed as (url, params, headers, cursor, cnx).
        self.url = url
        self.params = params
        self.headers = headers
        self.cnx = cnx
        self.cursor = cursor
        self.website = Website(url, params, headers, cursor, cnx)

    def parse(self):
        """Parse every '.event' div into
        (awayteam, awayline, hometeam, homeline, gametime) tuples."""
        # get the webpage soup
        soup = self.website.soup()
        # for each game
        games = []
        for eventdiv in soup.find_all('div', class_='event'):
            # read the game header
            header = eventdiv.find('h3').text
            # read the teams from a header like "Away Team at Home Team "
            match = re.search('(\w.+) at (\w.+) ', header)
            # Strip '-N'/'-A' (presumably neutral/alternate markers) and
            # anything after a dash; '.' becomes the SQL wildcard '%' —
            # NOTE(review): presumably for LIKE-matching team names
            # downstream; confirm.
            awayteamstr = re.sub('-.+', '', match.group(1).replace('-N','').replace('-A','')).replace('.', '%')
            hometeamstr = re.sub('-.+', '', match.group(2).replace('-N','').replace('-A','')).replace('.', '%')
            # read the gametime: the header carries YYYY-MM-DD HH:MM but
            # only the date parts are used; hours/minutes are discarded.
            match = re.search('(....)-(..)-(..)\s+(..):(..)', header)
            gametime = datetime( int(match.group(1)), int(match.group(2)), int(match.group(3)) )
            # read the lines — exactly two 'td.money' cells are expected.
            awaycell, homecell = [line for line in eventdiv.find_all('td', class_='money')]
            awayline = int(awaycell.text)
            homeline = int(homecell.text)
            games.append((
                awayteamstr,
                awayline,
                hometeamstr,
                homeline,
                gametime,
            ))
        return games

    def __repr__(self):
        return '{0}({1},{2},{3})'.format(self.__class__.__name__, self.url, self.params, self.headers)
def getSites(self):
    """Load every row of the `sites` table as a Website object."""
    global conn
    global cur
    cur.execute("SELECT * FROM sites")
    # One Website per fetched row, built from the row's named columns.
    return [
        Website(row['id'], row['name'], row['url'], row['searchUrl'],
                row['resultListing'], row['resultUrl'], row['absoluteUrl'],
                row['pageTitle'], row['pageBody'])
        for row in cur.fetchall()
    ]
def main():
    """
    Main producer pipeline
    :return: None
    """
    db_credentials = form_db_credentials()
    message_broker_credentials = form_message_broker_credentials()
    create_tables(db_credentials, tables)

    while True:
        data_to_check = get_data_to_check(db_credentials)
        time_to_sleep = get_sleep_time(db_credentials)

        # Nothing to check: sleep and poll again.
        if not data_to_check:
            sleep(time_to_sleep)
            logger.info(f'sleeping {time_to_sleep} seconds')
            continue

        # One check coroutine per (url, regexp) row, run concurrently.
        checks = [
            Website(url, message_broker_credentials, regexp).perform_check()
            for url, regexp in data_to_check
        ]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*checks))

        logger.info(f'sleeping {time_to_sleep} seconds')
        sleep(time_to_sleep)
def analyse_URL(jsonData):
    """
    Decide whether a website is phishing using its keywords and a Google
    search based on those.

    Parameters
    ----------
    jsonData: contains site data

    Returns
    -------
    (gb_results, jspageid, siteid) where gb_results is the
    (score, prediction) pair produced by the gradient-boosting model.
    """
    ws = Website(json=jsonData)
    print(datetime.now().strftime("%H:%M:%S.%f") + "-- building vector")
    # build feature vector
    feat_vec_temp = {}
    feat_vect_site = build_feat_vec.feature_vector(extractor, ws)
    feat_vec_temp[0] = feat_vect_site
    # Single-row DataFrame: features become columns, missing ones -> 0.
    feat_vect = DataFrame(feat_vec_temp)
    feat_vect = feat_vect.transpose().fillna(0)
    # prediction using gradient boosting
    # NOTE(review): "238" looks like a trained experiment/model id passed
    # through to _predict_gb — confirm its meaning there.
    exp = "238"
    features = feat_vect.columns
    print(datetime.now().strftime("%H:%M:%S.%f") + "-- vector done, start gradient boosting:")
    scoregb, predictiongb = _predict_gb(1, feat_vect, features, exp)
    gb_results = scoregb, predictiongb
    print(datetime.now().strftime("%H:%M:%S.%f") + "-- gradient done")
    global keep_track
    if keep_track:
        # Persist the raw JSON, tagged by whether it was flagged phishing
        # (prediction == 1 means phishing here).
        if gb_results[1] == 1:
            JSONtoFile(jsonData, True, jsonData['siteid'])
        else:
            JSONtoFile(jsonData, False, jsonData['siteid'])
    return gb_results, jsonData['jspageid'], jsonData['siteid']
def generate_relation_dict_beta(self, news_sources, news_targets):
    """Map each source name to a list of citation counts, one count per
    entry of *news_targets* (matched case-insensitively)."""
    relation_dict = {}
    for source_name in news_sources:
        # One counter slot per target, in news_targets order.
        target_count = [0] * len(news_targets)
        # Articles attached to this source's website that carry citations.
        articles = Article.objects(
            Q(website=Website.objects(name=source_name).only('name').first())
            & Q(citations__exists=True)).only('citations')
        for article in articles:
            for citation in article.citations:
                # int citations are skipped, matching the original guard.
                if not isinstance(citation, int):
                    cited = citation.target_name.lower()
                    for idx, target in enumerate(news_targets):
                        if cited == target.lower():
                            target_count[idx] += 1
        relation_dict[source_name] = target_count
    return relation_dict
def test_get_alexa_rank(website): websiteInstance = Website("www.gwern.net", None) print websiteInstance.get_alexa_rank("www.gwern.net") print websiteInstance.get_alexa_rank("www.a16z.com") print websiteInstance.get_alexa_rank("www.lesswrong.com") print websiteInstance.get_alexa_rank("www.facebook.com") print websiteInstance.get_alexa_rank("www.amazon.com")
#! /usr/bin/python from website import Website import sys siteObj = Website(sys.argv[1:]) siteObj.baseSearchURL = 'https://mail.google.com/mail/u/0/#inbox'
class Pinnacle:
    """Scraper for Pinnacle money-line tables (Python 2; note the print
    statement in parse())."""

    def __init__(self, url, params, headers, cnx, cursor):
        self.url = url
        self.params = params
        self.headers = headers
        self.cnx = cnx
        self.cursor = cursor
        # Website takes (url, params, headers, cursor, cnx) in that order.
        self.website = Website(url, params, headers, cursor, cnx)

    def parse(self):
        """Parse the 'linesTbl' tables into
        (teamA, lineA, teamB, lineB, drawline, gametime) tuples."""
        print 'parsing:', str(self)
        # get the webpage soup
        soup = self.website.soup()
        # the datatables
        tables = soup.find_all('table', class_='linesTbl')
        # slurp up rows (they come in groups of three)
        gamerows = {}
        for table in tables:
            # get the date for this table (MM/DD in the header text)
            datestr = table.select('.linesHeader')[0].find('h4').text
            match = re.search('(\d{0,2})/(\d{0,2})', datestr)
            month = int(match.group(1))
            day = int(match.group(2))
            # NOTE(review): the year is hard-coded to 2015 — confirm.
            date = datetime.date(2015, month, day)
            gamerows[date] = []
            # sigh, go through all colors of table
            # NOTE(review): all Alt1 rows are appended before Alt2 rows,
            # so the 3-row grouping below assumes each game's three rows
            # share the same Alt class — confirm against the page markup.
            for row in table.select('.linesAlt1'):
                gamerows[date].append(row)
            for row in table.select('.linesAlt2'):
                gamerows[date].append(row)
        # group rows into 3 tuples
        # http://code.activestate.com/recipes/303060-group-a-list-into-sequential-n-tuples/
        gametuples = {}
        for date in gamerows:
            gametuples[date] = []
            for i in range(0, len(gamerows[date]), 3):
                tup = gamerows[date][i:i+3]
                if len(tup) == 3:
                    gametuples[date].append(tuple(tup))
        # go through for times and lines
        lines = []
        for date in gametuples:
            for linerowa, linerowb, draw in gametuples[date]:
                # get the lines; an empty cell text falls back to -1
                lineaname = linerowa.select('.linesTeam')[0].text
                linebname = linerowb.select('.linesTeam')[0].text
                linealine = float(linerowa.select('.linesMLine')[0].text or -1)
                linebline = float(linerowb.select('.linesMLine')[0].text or -1)
                drawline = float(draw.select('.linesMLine')[0].text or -1)
                # gametime is midnight of the table's date.
                lines.append((lineaname, linealine, linebname, linebline,
                              drawline,
                              datetime.datetime.combine(date, datetime.time())))
        return lines

    def __repr__(self):
        return '{0}({1})'.format(self.__class__.__name__, self.website)