def scrapeFromQueue(q, config):
    task = q.get()

    try:
        # if there are "None" tasks in the queue, we are done
        if task is None:
            return True

        # get HTML
        with LogTimer("Load and DOM page"):
            htmlDOM = loadPage(task.request)

        # if no hint, need to create one
        if task.hint is None:
            task.hint = parserfactory.getParserHint(task.request)

        # determine how to parse HTML
        parser = parserfactory.ParserFactory(q, config, task.hint)

        # parse HTML
        if parser is not None:
            with LogTimer("Parse page ({})".format(str(task))):
                parser.parse(htmlDOM, task.data)
    except:
        stacktraceText = traceback.format_exc()
        logging.error(stacktraceText)
        with createConnFromConfig(config) as conn, conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO crawler.error (task, error_description) VALUES (%s, %s)',
                (pickle.dumps(task), stacktraceText))
            # TODO: Verify this works after converting to psycopg2. May need to escape_bytea

    q.task_done()
    return False
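# A minimal sketch of the worker loop that could drive scrapeFromQueue. The
# actual WorkerThread class is not shown in this excerpt, so the run()/stop()
# details below are assumptions: each worker keeps pulling tasks until
# scrapeFromQueue hits the "None" sentinel (returns True) or stop() is called.
import threading

class WorkerThreadSketch(threading.Thread):
    def __init__(self, q, config):
        super().__init__(daemon=True)
        self.q = q
        self.config = config
        self._stopRequested = threading.Event()

    def run(self):
        # scrapeFromQueue returns True only for the "None" sentinel task
        while not self._stopRequested.is_set():
            if scrapeFromQueue(self.q, self.config):
                break

    def stop(self):
        self._stopRequested.set()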
def loadPage(requestObj):
    """ Loads a page from a url with data """
    url = requestObj.url
    data = requestObj.data
    requestType = requestObj.requestType
    forceReload = requestObj.forceReload

    requestData = urllib.parse.urlencode(data)
    sortedDataString = "_".join([str(i) + "=" + str(data[i]) for i in sorted(data.keys())])

    cacheFilename = url
    if (len(sortedDataString) > 0):
        cacheFilename += "_" + requestType + "_" + sortedDataString
    cacheFilename = cacheFilename.replace("https://", "").replace("http://", "").replace("/", "_")
    if (not os.path.exists(CACHE_DIR)):
        os.makedirs(CACHE_DIR)
    cacheFilename = CACHE_DIR + cacheFilename

    if (not forceReload and os.path.exists(cacheFilename)):
        with LogTimer("Load web page (cached)", TimerType.WEB):
            cachedPage = open(cacheFilename)
            tidiedPage = cachedPage.read()
            cachedPage.close()
        return BeautifulSoup(tidiedPage, "html.parser")

    try:
        if requestType == "GET":
            # TODO: use Requests instead so we can send appropriate headers
            # this is useful since dance.zsconcepts.com uses the "referer"
            # will also be useful to set user-agent
            url = url + ("?" if len(data) != 0 else "") + requestData

        urlRequest = urllib.request.Request(url, data=requestData.encode("ascii"), headers=requestObj.headers)
        with LogTimer("Load web page", TimerType.WEB):
            response = urllib.request.urlopen(urlRequest)
            html_response = response.read()
            encoding = response.headers.get_content_charset("utf-8")
            decoded_html = html_response.decode(encoding)

        tidiedPage, pageErrors = tidy_document(decoded_html)

        cachedPage = open(cacheFilename, "w+")
        cachedPage.write(tidiedPage)
        cachedPage.close()

        return BeautifulSoup(tidiedPage, "html.parser")
    except urllib.error.HTTPError:
        logging.error("Failed to fetch %s with request: %s" % (url, requestObj))
        return None
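# The TODO above suggests switching to the Requests library so that headers
# like Referer and User-Agent can be sent cleanly. This is only a sketch of
# what the fetch step might look like with Requests (caching and tidying would
# stay as in loadPage); the 30-second timeout is an arbitrary assumption.
import requests

def fetchWithRequests(requestObj):
    if requestObj.requestType == "GET":
        response = requests.get(requestObj.url, params=requestObj.data,
                                headers=requestObj.headers, timeout=30)
    else:
        response = requests.post(requestObj.url, data=requestObj.data,
                                 headers=requestObj.headers, timeout=30)
    response.raise_for_status()
    # Requests decodes the body using the charset from the response headers
    return response.text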
def parse(self, htmlDOM, data):
    compId = data['compId']

    with LogTimer('Parse comp {}'.format(compId), TimerType.PARSE):
        tables = htmlDOM.find_all('table')
        mainTable = tables[1]
        rows = mainTable.find_all('tr')

        logging.info("Scraping " + compId)

        # Find first link
        rowNum = 0
        eventNum = 0
        while rowNum < len(rows):
            if (rows[rowNum].find('a') is not None):
                break
            rowNum += 1

        lastHeatName = None
        lastHeatLink = None

        # Keep finding appropriate links
        # As before, read first event page, read all heats of event
        events: List[EventData] = []
        entries: List[EventEntry] = []
        while rowNum < len(rows):
            rowText = rows[rowNum].get_text().strip()

            # Blank row
            if rowText == '----':
                pass

            # Row is a link. Read event and heats.
            elif (rows[rowNum].find('a') is not None):
                lastHeatName, lastHeatId, lastHeatLink = _parseHeatLink(rows[rowNum])
                if ("combine" not in lastHeatName.lower()):
                    events.append(EventData(compId, lastHeatId, lastHeatName, lastHeatLink, eventNum))
                    eventNum += 1

            # Row is a couple
            elif (lastHeatName is not None and "combine" not in lastHeatName.lower()):
                coupleNum, leaderName, followerName, placement, coupleLocation = _parseEntry(rows[rowNum].get_text().strip())
                entries.append(EventEntry(compId, lastHeatId, coupleNum, leaderName, followerName, placement, coupleLocation))

            rowNum += 1

    with LogTimer('Store event data {}'.format(compId), TimerType.DB):
        self._storeEvents(events)
    with LogTimer('Store entry data {}'.format(compId), TimerType.DB):
        self._storeEventEntries(entries)

    for event in events:
        self._enqueueRounds(event)
def _resetData(self, compId):
    with createConnFromConfig(self.config) as conn, \
         conn.cursor() as cursor, \
         LogTimer("Clean {}".format(compId), TimerType.DB):
        cursor.execute("DELETE FROM o2cm.judge WHERE comp_id = %s", (compId,))
        cursor.execute("DELETE FROM o2cm.round_result WHERE comp_id = %s", (compId,))
        cursor.execute("DELETE FROM o2cm.round_placement WHERE comp_id = %s", (compId,))
        cursor.execute("DELETE FROM o2cm.entry WHERE comp_id = %s", (compId,))
        cursor.execute("DELETE FROM o2cm.event WHERE comp_id = %s", (compId,))
        cursor.execute("DELETE FROM o2cm.competition WHERE comp_id = %s", (compId,))
def parse(self, htmlDOM, data):
    compId = data["compId"]
    eventId = data["eventId"]
    roundNum = data["roundNum"]

    logging.info("Scraping {}, {} Round {}".format(compId, eventId, roundNum))

    with LogTimer('Parse {}, {} Round {}'.format(compId, eventId, roundNum), TimerType.PARSE):
        # find all tables
        # last table will be for listing competitors and judges
        # if it is a final with multiple dances, there will also be a table for the summary
        isFinal = roundNum == 0
        tables = htmlDOM.find_all('table', class_='t1n')
        numResultTables = len(tables) - 1
        resultTables = tables[:numResultTables]
        summaryTable = None
        coupleAndJudgesTable = tables[-1]
        if isFinal and numResultTables > 1:
            # multi-dance final
            resultTables = tables[:numResultTables - 1]
            summaryTable = tables[numResultTables - 1]

        placements: List[RoundPlacement] = []
        results: List[RoundResult] = []
        for table in resultTables:
            tablePlacements, tableResults = self.parseTable(compId, eventId, roundNum, table, isFinal)
            placements += tablePlacements
            results += tableResults

        judges: List[JudgeInfo] = self.parseJudgeTable(
            compId, eventId, roundNum, coupleAndJudgesTable,
            len(getJudgeHeaders(resultTables[0])))

    with LogTimer('Save {}, {} Round {}'.format(compId, eventId, roundNum), TimerType.DB):
        self.storePlacements(placements)
        self.storeResults(results)
        self.storeJudges(judges)
def parse(self, htmlDOM, data):
    with LogTimer('Parsing main', TimerType.PARSE):
        compsOfInterest = getConfigProperty(self.config, 'scraper', 'o2cm', 'comps', default=[])

        compLinks = htmlDOM.find_all('a')

        yearInput = htmlDOM.find_all('input', id='inyear')[0]
        year = int(yearInput['value'])
        monthInput = htmlDOM.find_all('input', id='inmonth')[0]
        month = int(monthInput['value'])
        logging.info("Scraping o2cm: %d %d" % (year, month))

        minYear = int(yearInput['min'])
        month -= 1
        if month < 1:
            month = 12
            year -= 1
        if year >= minYear:
            self._createNextMainPageRequest(year, month)

        for tag in compLinks:
            date = tag.parent.previous_sibling.previous_sibling.string.strip()
            compName = tag.get_text()
            link = tag['href']
            m = re.match(r'event[23].asp\?event=([a-zA-Z]{0,4}\d{0,5}[a-zA-Z]?)&.*', link)
            compId = m.group(1).lower()

            if (len(compsOfInterest) == 0 or compId in compsOfInterest):
                m = re.match(r'([a-z]+)\d+.*', compId)
                # compCode = m.group(1)
                fullDate = date + " " + str(year)
                self._resetData(compId)
                self._storeData(compId, compName, fullDate)
                self._createCompPageRequest(compId, compName)
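# A quick illustration of what the compId regular expression above extracts,
# using a made-up results link (the event id "abc23" is hypothetical):
#
#   >>> link = 'event3.asp?event=abc23&bclr=1'
#   >>> m = re.match(r'event[23].asp\?event=([a-zA-Z]{0,4}\d{0,5}[a-zA-Z]?)&.*', link)
#   >>> m.group(1).lower()
#   'abc23'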
def _storeData(self, compId, compName, compDate):
    with createConnFromConfig(self.config) as conn, \
         conn.cursor() as cursor, \
         LogTimer("Save {}".format(compId), TimerType.DB):
        cursor.execute(
            "INSERT INTO o2cm.competition (comp_id, comp_name, comp_date) VALUES (%s, %s, %s)",
            (compId, compName, compDate))
cursor.execute("SELECT error_id, task, error_description FROM crawler.error")
exceptions = cursor.fetchall()
for e in exceptions:
    task = pickle.loads(conn.unescape_bytea(e[1]))
    q.put(task)
cursor.execute("DELETE FROM crawler.error")

for url in args.seedUrls:
    request = WebRequest(url)
    request.forceReload = args.reload
    seedTask = ScraperTask(request, {'url': url})
    q.put(seedTask)

# start workers
with LogTimer('Total work', TimerType.GENERAL):
    workers = []
    for i in range(args.numWorkers[0]):
        worker = WorkerThread(q, config)
        worker.start()
        workers.append(worker)

    try:
        # block until we finish scraping
        q.join()
    except CrawlerExit:
        logging.info("Stopping crawler")
        for worker in workers:
            worker.stop()
    finally:
        # stop workers