Example #1
def scrapeFromQueue(q, config):
    task = q.get()

    try:
        # a "None" task in the queue is the sentinel that signals we are done;
        # mark it done so q.join() can still return
        if task is None:
            q.task_done()
            return True

        # get HTML
        with LogTimer("Load and DOM page"):
            htmlDOM = loadPage(task.request)

        # if no hint, need to create one
        if task.hint is None:
            task.hint = parserfactory.getParserHint(task.request)

        # determine how to parse HTML
        parser = parserfactory.ParserFactory(q, config, task.hint)

        # parse HTML
        if parser is not None:
            with LogTimer("Parse page ({})".format(str(task))):
                parser.parse(htmlDOM, task.data)
    except Exception:
        # log the failure and persist the task so a later run can retry it (see Example #8)
        stacktraceText = traceback.format_exc()
        logging.error(stacktraceText)
        with createConnFromConfig(config) as conn, conn.cursor() as cursor:
            cursor.execute(
                'INSERT INTO crawler.error (task, error_description) VALUES (%s, %s)',
                (pickle.dumps(task), stacktraceText)
            )  # TODO: Verify this works after converting to psycopg2. May need to escape_bytea

    q.task_done()

    return False
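
Example #1 is the per-task body of a worker; Example #8 constructs WorkerThread(q, config) objects that expose start() and stop(). The worker class itself is not shown in these examples, so the following is only a minimal sketch of how such a thread might drive scrapeFromQueue; the Event-based stop flag and the loop structure are assumptions, not the original implementation.

import threading

class WorkerThread(threading.Thread):
    """Hypothetical sketch of the worker used in Example #8; details are assumed."""

    def __init__(self, q, config):
        super().__init__(daemon=True)
        self.q = q
        self.config = config
        self._stopRequested = threading.Event()  # assumption: stop() just sets a flag

    def run(self):
        # keep pulling tasks until scrapeFromQueue reports the "None" sentinel
        while not self._stopRequested.is_set():
            if scrapeFromQueue(self.q, self.config):
                break

    def stop(self):
        # note: a worker blocked inside q.get() still needs a None sentinel to wake up
        self._stopRequested.set()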
Example #2
def loadPage(requestObj):
    """
    Loads a page from a url with data
    """

    url = requestObj.url
    data = requestObj.data
    requestType = requestObj.requestType
    forceReload = requestObj.forceReload

    requestData = urllib.parse.urlencode(data)

    sortedDataString = "_".join([str(i) + "=" + str(data[i]) for i in sorted(data.keys())])
    cacheFilename = url
    if (len(sortedDataString) > 0):
        cacheFilename += "_" + requestType + "_" + sortedDataString

    cacheFilename = cacheFilename.replace("https://","").replace("http://", "").replace("/", "_")
    # exist_ok avoids a race when multiple worker threads create the cache dir at once
    os.makedirs(CACHE_DIR, exist_ok=True)

    cacheFilename = CACHE_DIR + cacheFilename
    if (not forceReload and os.path.exists(cacheFilename)):
        with LogTimer("Load web page (cached)", TimerType.WEB):
            with open(cacheFilename, encoding="utf-8") as cachedPage:
                tidiedPage = cachedPage.read()
        return BeautifulSoup(tidiedPage, "html.parser")

    try:
        if requestType == "GET":
            # TODO: use Requests instead so we can send appropriate headers
            # this is useful since dance.zsconcepts.com uses the "referer"
            # will also be useful to set user-agent
            url = url + ("?" if len(data) != 0 else "") + requestData
            # GET parameters belong in the URL; passing a body would make urllib issue a POST
            urlRequest = urllib.request.Request(url, headers=requestObj.headers)
        else:
            urlRequest = urllib.request.Request(url, data=requestData.encode("ascii"), headers=requestObj.headers)

        with LogTimer("Load web page", TimerType.WEB):
            response = urllib.request.urlopen(urlRequest)

        html_response = response.read()
        encoding = response.headers.get_content_charset("utf-8")
        decoded_html = html_response.decode(encoding)
        tidiedPage, pageErrors = tidy_document(decoded_html)

        # cache the tidied page for future runs
        with open(cacheFilename, "w", encoding="utf-8") as cachedPage:
            cachedPage.write(tidiedPage)

        return BeautifulSoup(tidiedPage, "html.parser")
    except urllib.error.HTTPError:
        logging.error("Failed to fetch %s with request: %s", url, requestObj)
        return None
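
loadPage only reads attributes off requestObj (url, data, requestType, forceReload, headers), and Example #8 constructs it as WebRequest(url) before setting forceReload. The class itself is not shown, so here is a minimal sketch of what it might look like; the default values are assumptions.

class WebRequest:
    """Hypothetical sketch of the request object loadPage consumes; defaults are assumptions."""

    def __init__(self, url, data=None, requestType="GET", headers=None):
        self.url = url
        self.data = data if data is not None else {}
        self.requestType = requestType
        self.headers = headers if headers is not None else {}
        self.forceReload = False  # Example #8 flips this from a command-line flag

    def __repr__(self):
        return "WebRequest({} {} data={})".format(self.requestType, self.url, self.data)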
Example #3
    def parse(self, htmlDOM, data):
        compId = data['compId']
        with LogTimer('Parse comp {}'.format(compId), TimerType.PARSE):
            tables = htmlDOM.find_all('table')
            mainTable = tables[1]
            rows = mainTable.find_all('tr')

            logging.info("Scraping " + compId)

            # Find first link
            rowNum = 0
            eventNum = 0
            while rowNum < len(rows):
                if rows[rowNum].find('a') is not None:
                    break
                rowNum += 1

            lastHeatName = None
            lastHeatId = None
            lastHeatLink = None
            # Keep finding appropriate links
            # As before, read first event page, read all heats of event
            events: List[EventData] = []
            entries: List[EventEntry] = []
            while rowNum < len(rows):
                rowText = rows[rowNum].get_text().strip()

                # Separator row ('----')
                if rowText == '----':
                    pass

                # Row is a link. Read event and heats.
                elif rows[rowNum].find('a') is not None:
                    lastHeatName, lastHeatId, lastHeatLink = _parseHeatLink(rows[rowNum])
                    if ("combine" not in lastHeatName.lower()):
                        events.append(EventData(compId, lastHeatId, lastHeatName, lastHeatLink, eventNum))
                        eventNum += 1

                # Row is a couple
                elif (lastHeatName is not None and "combine" not in lastHeatName.lower()):
                    coupleNum, leaderName, followerName, placement, coupleLocation = _parseEntry(rows[rowNum].get_text().strip())
                    entries.append(EventEntry(compId, lastHeatId, coupleNum, leaderName, followerName, placement, coupleLocation))
                rowNum += 1

        with LogTimer('Store event data {}'.format(compId), TimerType.DB):
            self._storeEvents(events)

        with LogTimer('Store entry data {}'.format(compId), TimerType.DB):
            self._storeEventEntries(entries)

        for event in events:
            self._enqueueRounds(event)
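
The parser above accumulates EventData and EventEntry records and only writes to the database once per competition. Their definitions are not shown in these examples; a sketch consistent with the constructor calls above could look like the following (field names are inferred from the call sites, and the types are assumptions).

from typing import NamedTuple, Optional

class EventData(NamedTuple):
    compId: str
    eventId: str
    eventName: str
    eventLink: str
    eventNum: int

class EventEntry(NamedTuple):
    compId: str
    eventId: str
    coupleNum: str
    leaderName: str
    followerName: str
    placement: Optional[str]
    coupleLocation: Optional[str]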
Example #4
    def _resetData(self, compId):
        with createConnFromConfig(self.config) as conn, conn.cursor() as cursor, LogTimer("Clean {}".format(compId), TimerType.DB):
            cursor.execute("DELETE FROM o2cm.judge WHERE comp_id = %s", (compId, ))
            cursor.execute("DELETE FROM o2cm.round_result WHERE comp_id = %s", (compId, ))
            cursor.execute("DELETE FROM o2cm.round_placement WHERE comp_id = %s", (compId, ))
            cursor.execute("DELETE FROM o2cm.entry WHERE comp_id = %s", (compId, ))
            cursor.execute("DELETE FROM o2cm.event WHERE comp_id = %s", (compId, ))
            cursor.execute("DELETE FROM o2cm.competition WHERE comp_id = %s", (compId, ))
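
Examples #1, #4, and #7 all open the database through createConnFromConfig and use the connection and cursor as context managers. The helper itself is not shown; a minimal sketch assuming psycopg2 (which the TODO in Example #1 mentions) and a "database" section in the config could look like this, where the key names are assumptions.

import psycopg2

def createConnFromConfig(config):
    """Hypothetical sketch; the config layout and key names are assumptions."""
    dbConfig = config["database"]
    return psycopg2.connect(
        host=dbConfig["host"],
        port=dbConfig.get("port", 5432),
        dbname=dbConfig["name"],
        user=dbConfig["user"],
        password=dbConfig["password"],
    )

With psycopg2, entering the connection as a context manager wraps the block in a transaction that commits on success and rolls back on an exception (it does not close the connection), which fits the per-operation pattern used in the examples.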
Example #5
    def parse(self, htmlDOM, data):
        compId = data["compId"]
        eventId = data["eventId"]
        roundNum = data["roundNum"]

        logging.info("Scraping {}, {} Round {}".format(compId, eventId,
                                                       roundNum))
        with LogTimer(
                'Parse {}, {} Round {}'.format(compId, eventId, roundNum),
                TimerType.PARSE):
            # find all tables
            # last table will be for listing competitors and judges
            # if it is a final with multiple dances, there will also be a table for the summary
            isFinal = roundNum == 0

            tables = htmlDOM.find_all('table', class_='t1n')
            numResultTables = len(tables) - 1
            resultTables = tables[:numResultTables]
            summaryTable = None
            coupleAndJudgesTable = tables[-1]

            if isFinal and numResultTables > 1:  # multi-dance final
                resultTables = tables[:numResultTables - 1]
                summaryTable = tables[numResultTables - 1]

            placements: List[RoundPlacement] = []
            results: List[RoundResult] = []
            for table in resultTables:
                tablePlacements, tableResults = self.parseTable(
                    compId, eventId, roundNum, table, isFinal)
                placements += tablePlacements
                results += tableResults

            judges: List[JudgeInfo] = self.parseJudgeTable(
                compId, eventId, roundNum, coupleAndJudgesTable,
                len(getJudgeHeaders(resultTables[0])))

        with LogTimer('Save {}, {} Round {}'.format(compId, eventId, roundNum),
                      TimerType.DB):
            self.storePlacements(placements)
            self.storeResults(results)
            self.storeJudges(judges)
Example #6
    def parse(self, htmlDOM, data):
        with LogTimer('Parsing main', TimerType.PARSE):
            compsOfInterest = getConfigProperty(self.config, 'scraper', 'o2cm', 'comps', default=[])

            compLinks = htmlDOM.find_all('a')
            yearInput = htmlDOM.find_all('input', id='inyear')[0]
            year = int(yearInput['value'])
            monthInput = htmlDOM.find_all('input', id='inmonth')[0]
            month = int(monthInput['value'])

            logging.info("Scraping o2cm: %d %d" % (year, month))

            minYear = int(yearInput['min'])
            month -= 1

            if month < 1:
                month = 12
                year -= 1

            if year >= minYear:
                self._createNextMainPageRequest(year, month)

            for tag in compLinks:
                date = tag.parent.previous_sibling.previous_sibling.string.strip()
                compName = tag.get_text()
                link = tag['href']
                m = re.match(r'event[23]\.asp\?event=([a-zA-Z]{0,4}\d{0,5}[a-zA-Z]?)&.*', link)
                if m is None:
                    # skip anchors that are not competition result links
                    continue
                compId = m.group(1).lower()

                if (len(compsOfInterest) == 0 or compId in compsOfInterest):
                    m = re.match(r'([a-z]+)\d+.*', compId)
                    # compCode = m.group(1)
                    fullDate = date + " " + str(year)

                    self._resetData(compId)

                    self._storeData(compId, compName, fullDate)
                    self._createCompPageRequest(compId, compName)
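
The parse method above looks up the competitions of interest with getConfigProperty(self.config, 'scraper', 'o2cm', 'comps', default=[]). That helper is not shown in these examples; judging from the call site it is a nested lookup with a default, roughly like the sketch below (the dict-based config format is an assumption).

def getConfigProperty(config, *path, default=None):
    """Hypothetical sketch: walk nested config dicts, returning default if any key is missing."""
    node = config
    for key in path:
        if not isinstance(node, dict) or key not in node:
            return default
        node = node[key]
    return node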
Example #7
    def _storeData(self, compId, compName, compDate):
        with createConnFromConfig(self.config) as conn, conn.cursor() as cursor, LogTimer("Save {}".format(compId), TimerType.DB):
            cursor.execute("INSERT INTO o2cm.competition (comp_id, comp_name, comp_date) VALUES (%s, %s, %s)", (compId, compName, compDate))
Example #8
            cursor.execute(
                "SELECT error_id, task, error_description FROM crawler.error")
            exceptions = cursor.fetchall()
            for e in exceptions:
                # psycopg2 returns bytea columns as bytes-like objects, so no unescaping is needed
                task = pickle.loads(bytes(e[1]))
                q.put(task)
            cursor.execute("DELETE FROM crawler.error")

    for url in args.seedUrls:
        request = WebRequest(url)
        request.forceReload = args.reload
        seedTask = ScraperTask(request, {'url': url})
        q.put(seedTask)

    # start workers
    with LogTimer('Total work', TimerType.GENERAL):
        workers = []
        for i in range(args.numWorkers[0]):
            worker = WorkerThread(q, config)
            worker.start()
            workers.append(worker)

        try:
            # block until we finish scraping
            q.join()
        except CrawlerExit:
            logging.info("Stopping crawler")
            for worker in workers:
                worker.stop()
        finally:
            # stop workers