Example #1
 def __init__(self, coursecode, connection, cycle, term, requisitiontype):
     self.coursecode = coursecode
     self.crawler = Crawler()
     self.connection = connection
     self.cycle = cycle
     self.term = term
     self.requisitiontype = requisitiontype
Example #2
 def __init__(self):
     self.connection = MySQLConnection()
     self.timeperiod = None
     self.cycle = None
     self.coursereader = None
     self.offerreader = None
     self.crawler = Crawler()
     self.offerreader = OfferReader(self.connection, self.cycle,
                                    self.timeperiod)
Example #3
 def testItReturnsTheCrawledUrls(self):
     crawler = Crawler()
     urlsToCrawl = ['http://google.se', 'http://aftonbladet.se']
     for url in urlsToCrawl:
         crawler.add_to_crawl(url)
     result = crawler.crawl()
     self.assertEqual(
         urlsToCrawl, result,
         'Not all URLs that were supposed to be crawled were crawled.')
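For context, the test in Example #3 only relies on a Crawler that queues URLs through add_to_crawl() and returns them from crawl(). A minimal in-memory sketch of that assumed interface (not the project's actual crawler, which fetches pages) would be:

class Crawler(object):
    """Minimal stand-in for the interface Example #3 exercises."""

    def __init__(self):
        self.urls_to_crawl = []

    def add_to_crawl(self, url):
        # Queue a URL; a real crawler would also deduplicate and later fetch it.
        self.urls_to_crawl.append(url)

    def crawl(self):
        # Return the queued URLs in order; a real crawler would fetch each page.
        return list(self.urls_to_crawl)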
Example #4
def main():
    """ An example how the search engine could be used.  """

    seed = [
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d01.html',
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d06.html',
        'http://people.f4.htw-berlin.de/fileadmin/user_upload/Dozenten/WI-Dozenten/Classen/DAWeb/smdocs/d08.html'
    ]

    # Instantiate the crawler.
    crawler = Crawler()

    # Start the crawler with the seed.
    crawler.start_crawling(seed)

    # Access the pages.
    pages = crawler.pages

    # Print the content of the pages
    print(pages)

    # Print the link structure
    link_structure_txt = pages.get_link_structure_text()
    print(link_structure_txt)

    # Printing and creation of the index
    indexer = Indexer()
    indexer.index_pages(pages)
    index = indexer.index
    print(index)

    # Calculation and Printing of Page Rank
    pagerank = Page_Rank()
    pagerank.fill_matrix(crawler)
    pagerank.calculate_probabilities(0.05, 0.95)
    pagerank.calculate_page_rank(0.04)
    print()

    # Scoring
    example_queries = ['tokens', 'index', 'classification', 'tokens classification']
    analyzer = CosinusAnalyzer(index, pages)
    print(analyzer.get_length_of_pages_text())

    # Cosinus Scoring
    print(StringUtil.header('cosine_scores.txt'))
    for query in example_queries:
        hits = analyzer.analyze(query)
        print(hits)
    print()

    # Cosinus Scoring combined with the page rank.
    print(StringUtil.header('Cosinus combined with Page Rank'))
    for query in example_queries:
        hits = analyzer.analyze(query, combine_with_page_rank=True)
        print(hits)
    print()
Example #5
 def test_get_children(self):
     page = "http://www.taste.com.au/"
     # Exclude depth for now
     depth = 3
     # children = 3
     crawler = Crawler(page, depth)
     crawler.run()
     # Should have depth*children urls from crawl result
     self.assertGreater(len(crawler.urls), 1)
     print(crawler.urls)
Example #6
class CourseReader(object):
    """Object that reads the page of a course with a given coursecode and
    inserts all the relevant information in the database"""

    def __init__(self, coursecode, connection, cycle, term, requisitiontype):
        self.coursecode = coursecode
        self.crawler = Crawler()
        self.connection = connection
        self.cycle = cycle
        self.term = term
        self.requisitiontype = requisitiontype

    def scancoursepage(self):
        match = Course.find(courseCode_equal=self.coursecode)
        urlstart = 'https://uspdigital.usp.br/jupiterweb/obterDisciplina'
        parameters = {'sgldis': str(self.coursecode)}
        completeurl = appendparameters(urlstart, parameters)
        self.crawler.loadpage(completeurl)
        name = self.findname()
        startdate = self.findstartdate()
        if match:
            course = match[0]
        else:
            course = Course(self.coursecode, name, startdate)
            course.store()
        idealtermmatch = IdealTermCourse.find(idCycle=self.cycle.idCycle,
                                              term=self.term,
                                              idCourse=course.idCourse,
                                              requisitionType=
                                              self.requisitiontype)
        if not idealtermmatch:
            idealterm = IdealTermCourse(self.cycle.idCycle, self.term,
                                        startdate, self.requisitiontype,
                                        course)
            idealterm.store()
        else:
            idealterm = idealtermmatch[0]
        return course

    def findname(self):
        namedata = self.crawler.htmlpage.findAll('b', {})
        # The name will be on the third font tag found
        completename = getwhatisbetweenthetags(unicode(namedata[3]))
        name = completename[22:]  # Disciplina: XXXXXXX - name
        return name

    def findstartdate(self):
        startdatedict = {'class': 'txt_arial_8pt_gray'}
        startdatedata = self.crawler.htmlpage.findAll('span', startdatedict)
        # The startdate will be on the fourth tag found
        completedata = getwhatisbetweenthetags(unicode(startdatedata[4]))
        data = removewhitespaces(completedata)
        data = convertdate(data)
        return data
Example #7
class CourseReader(object):
    """Object that reads the page of a course with a given coursecode and
    inserts all the relevant information in the database"""
    def __init__(self, coursecode, connection, cycle, term, requisitiontype):
        self.coursecode = coursecode
        self.crawler = Crawler()
        self.connection = connection
        self.cycle = cycle
        self.term = term
        self.requisitiontype = requisitiontype

    def scancoursepage(self):
        match = Course.find(courseCode_equal=self.coursecode)
        urlstart = 'https://uspdigital.usp.br/jupiterweb/obterDisciplina'
        parameters = {'sgldis': str(self.coursecode)}
        completeurl = appendparameters(urlstart, parameters)
        self.crawler.loadpage(completeurl)
        name = self.findname()
        startdate = self.findstartdate()
        if match:
            course = match[0]
        else:
            course = Course(self.coursecode, name, startdate)
            course.store()
        idealtermmatch = IdealTermCourse.find(
            idCycle=self.cycle.idCycle,
            term=self.term,
            idCourse=course.idCourse,
            requisitionType=self.requisitiontype)
        if not idealtermmatch:
            idealterm = IdealTermCourse(self.cycle.idCycle, self.term,
                                        startdate, self.requisitiontype,
                                        course)
            idealterm.store()
        else:
            idealterm = idealtermmatch[0]
        return course

    def findname(self):
        namedata = self.crawler.htmlpage.findAll('b', {})
        # The name will be on the third font tag found
        completename = getwhatisbetweenthetags(unicode(namedata[3]))
        name = completename[22:]  # Disciplina: XXXXXXX - name
        return name

    def findstartdate(self):
        startdatedict = {'class': 'txt_arial_8pt_gray'}
        startdatedata = self.crawler.htmlpage.findAll('span', startdatedict)
        # The startdate will be on the fourth tag found
        completedata = getwhatisbetweenthetags(unicode(startdatedata[4]))
        data = removewhitespaces(completedata)
        data = convertdate(data)
        return data
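A usage sketch for the CourseReader above; the course code, connection, cycle, term, and requisition type below are hypothetical placeholders, not values from the original project:

# Hypothetical arguments: a live DB connection, a Cycle object and a course code.
reader = CourseReader('MAC0110', connection, cycle, term=1, requisitiontype=1)
course = reader.scancoursepage()  # loads the JupiterWeb page and stores the course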
Example #8
def start():
    es = create_es_connection(
        config['database']['host'],
        config['database']['port'],
        config['database']['access_key'],
        config['database']['secret_key'],
        config['database']['region'])
    threads = []
    for search in config['searches']:
        ed = ElasticDriver(es, search['name'])
        i = 1
        for twitterAccount in search['twitterAccounts']:
            td = TwitterDriver(
                search['keywords'],
                ed,
                search['sensitivity'],
                twitterAccount['consumer_key'],
                twitterAccount['consumer_secret'],
                twitterAccount['access_token_key'],
                twitterAccount['access_token_secret']
            )
            threads.append(Crawler(f'{search["name"]}-{i}', search['keywords'], es, td))
            i += 1

    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()
Example #9
 def __init__(self, coursecode, connection, cycle, term, requisitiontype):
     self.coursecode = coursecode
     self.crawler = Crawler()
     self.connection = connection
     self.cycle = cycle
     self.term = term
     self.requisitiontype = requisitiontype
Example #10
    def test_crawler_recurses(self):
        # Arrange
        html = """
<html><body><a href="http://testurl.com/testpage.html">Link text</a></body></html>
"""
        initial_url = 'http://www.initialurl.com/'

        mock_urllib = create_autospec(urllib2)
        crawler = Crawler(mock_urllib)

        # Act
        crawler.crawl([initial_url])

        # Assert
        expected_calls = [call.urlopen(initial_url), call.urlopen('http://testurl.com/testpage.html')]
        mock_urllib.assert_has_calls(expected_calls)
Example #11
 def __init__(self):
     self.connection = MySQLConnection()
     self.timeperiod = None
     self.cycle = None
     self.coursereader = None
     self.offerreader = None
     self.crawler = Crawler()
     self.offerreader = OfferReader(self.connection, self.cycle,
                                    self.timeperiod)
Example #12
    def test_crawler_recurses(self):
        # Arrange
        html = """
<html><body><a href="http://testurl.com/testpage.html">Link text</a></body></html>
"""
        initial_url = 'http://www.initialurl.com/'

        mock_urllib = create_autospec(urllib2)
        crawler = Crawler(mock_urllib)

        # Act
        crawler.crawl([initial_url])

        # Assert
        expected_calls = [
            call.urlopen(initial_url),
            call.urlopen('http://testurl.com/testpage.html')
        ]
        mock_urllib.assert_has_calls(expected_calls)
Example #13
def scrape_documents(
        min_count=0,
        url_seeds=GlobalConfiguration.DEFAULT_URL_SEEDS):

    doc_count = 0

    s = Crawler(url_seeds)
    docs = s.crawl(min_count)

    while min_count <= 0 or doc_count < min_count:
        for doc in docs:
            temp_file = get_new_file()
            pickle.dump(doc, temp_file)
            temp_file.close()
            log.debug('saved image doc from %s', doc.url)
            doc_count += 1
            if doc_count % 100 == 0:
                log.info('%d images and counting...', doc_count)

    log.info('finished indexing images.')
    log.info('%d documents indexed', doc_count)
Example #14
class FacultyReader(object):
    """A reader object which will use a crawler to scan through the cycles
    of a faculty"""
    def __init__(self):
        self.connection = None
        self.timeperiod = None
        self.faculty = None
        self.crawler = Crawler()

    @staticmethod
    def initwithconnection(host, user, password, database):
        """Returns a new CycleReader object with its connection already
        configured using the host, user, password and database name
        provided as parameters. Additionally, it configures the charset
        to be used as unicode"""
        facultyreader = FacultyReader()
        facultyreader.connection = MySQLdb.connect(host=host,
                                                   user=user,
                                                   passwd=password,
                                                   db=database,
                                                   use_unicode=True,
                                                   charset='utf8')
        return facultyreader

    def settimeperiod(self, idtimeperiod):
        "Sets the timeperiod of this cycle by providing its id"
        self.timeperiod = idtimeperiod

    def setfaculty(self, idfaculty):
        "Sets the cycle of this reader by searching for the cycle in the bank"
        self.faculty = Faculty.pickById(idfaculty)

    def startreading(self):
        """Starts scanning through the Faculty's page and iterates through each
        of it's cycles"""
        urlstart = 'https://uspdigital.usp.br/jupiterweb/jupCursoLista'
        parameters = {'codcg': self.faculty.idFaculty, 'tipo': 'N'}
        completeurl = appendparameters(urlstart, parameters)
        self.crawler.loadpage(completeurl)
Example #15
class FacultyReader(object):
    """A reader object which will use a crawler to scan through the cycles
    of a faculty"""
    def __init__(self):
        self.connection = None
        self.timeperiod = None
        self.faculty = None
        self.crawler = Crawler()

    @staticmethod
    def initwithconnection(host, user, password, database):
        """Returns a new CycleReader object with its connection already
        configured using the host, user, password and database name
        provided as parameters. Additionally, it configures the charset
        to be used as unicode"""
        facultyreader = FacultyReader()
        facultyreader.connection = MySQLdb.connect(host=host, user=user,
                                                   passwd=password,
                                                   db=database,
                                                   use_unicode=True,
                                                   charset='utf8')
        return facultyreader

    def settimeperiod(self, idtimeperiod):
        "Sets the timeperiod of this cycle by providing its id"
        self.timeperiod = idtimeperiod

    def setfaculty(self, idfaculty):
        "Sets the cycle of this reader by searching for the cycle in the bank"
        self.faculty = Faculty.pickById(idfaculty)

    def startreading(self):
        """Starts scanning through the Faculty's page and iterates through each
        of it's cycles"""
        urlstart = 'https://uspdigital.usp.br/jupiterweb/jupCursoLista'
        parameters = {'codcg': self.faculty.idFaculty, 'tipo': 'N'}
        completeurl = appendparameters(urlstart, parameters)
        self.crawler.loadpage(completeurl)
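A usage sketch for the FacultyReader above (host, credentials, database name, and ids are placeholders): the reader is built through the initwithconnection() factory, then pointed at a time period and a faculty before reading:

facultyreader = FacultyReader.initwithconnection('localhost', 'jupiter_user',
                                                 'secret', 'jupiterweb')
facultyreader.settimeperiod(1)   # placeholder idTimePeriod
facultyreader.setfaculty(45)     # placeholder idFaculty
facultyreader.startreading()     # loads the faculty's course list page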
Example #16
from crawler.Crawler import Crawler
if __name__ == '__main__':
   crawler = Crawler()
   crawler.crawl('http://www.prestigetime.com/')
Example #17
class CycleReader(object):
    """A reader object which will use a crawler to scan through the page
    of a cycle"""
    def __init__(self):
        self.connection = MySQLConnection()
        self.timeperiod = None
        self.cycle = None
        self.coursereader = None
        self.offerreader = None
        self.crawler = Crawler()
        self.offerreader = OfferReader(self.connection, self.cycle,
                                       self.timeperiod)

    def settimeperiod(self, idtimeperiod):
        "Sets the timeperiod of this cycle by providing its id"
        self.timeperiod = idtimeperiod
        self.offerreader.timeperiod = TimePeriod.pickById(idtimeperiod)

    def setcycle(self, idcycle):
        "Sets the cycle of this reader by searching for the cycle in the bank"
        self.cycle = Cycle.pickById(idcycle)
        self.offerreader.cycle = self.cycle

    def startreadingcycles(self):
        """Starts scanning through the Cycle's page and iterates through each
        of its courses. This function will not read 'Ciclo Básico' or 'Grande
        Área'"""
        urlstart = 'https://uspdigital.usp.br/jupiterweb/listarGradeCurricular'
        codcg = self.findidfaculty()
        codcurlist = self.findcodcur()
        codhab = str(self.cycle.cycleCode)
        for codcur in codcurlist:
            parameters = {
                'codcg': str(codcg),
                'codcur': str(codcur),
                'codhab': codhab,
                'tipo': 'N'
            }
            completeurl = appendparameters(urlstart, parameters)
            self.crawler.loadpage(completeurl)
            coursecodedata = self.crawler.htmlpage.findAll('table', {})
            coursecodedata = coursecodedata[1]  # The table with the courses
            codes = getcoursecodes(coursecodedata)
            if codes:
                break
        return self.startreadingoffers(codes)

    def findidfaculty(self):
        "returns the id of the faculty that has this cycle"
        relations = 'rel_courseCoordination_cycle r1, '\
            'rel_courseCoordination_faculty r2 '
        conditions = 'WHERE r1.idCourseCoordination = r2.idCourseCoordination'\
            ' AND r1.idCycle = ' + str(self.cycle.idCycle)
        query = 'SELECT idFaculty FROM ' + relations + conditions
        results = self.connection.execute(query)
        try:
            idfaculty = results[0][0]
        except IndexError:
            raise IndexError('Não conseguiu achar idFaculty,\
                             checar rel_courseCoordination_cycle e\
                             rel_courseCoordination_faculty com idCycle = ' +
                             str(self.cycle.idCycle))
            sys.exit()
        return idfaculty

    def findcodcur(self):
        """returns the idAcademicProgram representing the code for this
        object's cycle"""
        query = 'SELECT idAcademicProgram FROM rel_academicProgram_cycle '\
                'WHERE idCycle = ' + str(self.cycle.idCycle)
        codcurall = self.connection.execute(query)
        listcodcur = []
        for codcur in codcurall:
            listcodcur.append(codcur[0])
        return listcodcur

    def startreadingoffers(self, coursecodes):
        "Using the coursecodes list, reads all the offers from each code"
        requisitiontypetranslationdict = {0: 1, 1: 2, 2: 3}
        # In the bank: 1 - Obrigatória, 2 - Eletiva, 3 - Livre
        index = 0
        offers = []
        while index < len(coursecodes):
            for period in coursecodes[index]:
                for code in coursecodes[index][period]:
                    if code in COURSES_TO_IGNORE:
                        continue
                    reader = CourseReader(
                        code, self.connection, self.cycle, period,
                        requisitiontypetranslationdict[index])
                    course = reader.scancoursepage()
                    self.offerreader.setcourse(course)
                    offers.extend(self.offerreader.scancourseofferpage())
            index += 1
        return offers
Example #18
 def __init__(self, connection, cycle, timeperiod):
     self.connection = connection
     self.timeperiod = timeperiod
     self.cycle = cycle
     self.crawler = Crawler()
     self.course = None
Example #19
class OfferReader(object):
    """A reader object which will use a crawler to scan through the page
    describing the offers of a course"""
    def __init__(self, connection, cycle, timeperiod):
        self.connection = connection
        self.timeperiod = timeperiod
        self.cycle = cycle
        self.crawler = Crawler()
        self.course = None

    def setcourse(self, course):
        "Sets the course which will be scanned"
        if not isinstance(course, Course):
            raise Exception('Invalid course parameter')
        self.course = course

    def scancourseofferpage(self):
        "Scans the page of the course, adding new offers to the bank"
        urlstart = 'https://uspdigital.usp.br/jupiterweb/obterTurma'
        fullurl = appendparameters(urlstart,
                                   {'sgldis': self.course.courseCode})
        self.crawler.loadpage(fullurl)
        offers = []
        if re.match('[\S\s]*Hor&aacute;rio[\S\s]*',
                    str(self.crawler.htmlpage)):
            offersdata = self.getrelevantdata()
            offers.extend(self.createoffers(offersdata))
        return offers

    def checkprofessor(self, professorname):
        """Checks if the professor with the name passed as argument is already
        in the bank and, if it isn't, stores a new professor. After that,
        returns the Professor object. If the professor name is empty, there is
        a special entry in the bank for this case """
        professor = Professor.find(name_equal=professorname)
        if not professor:
            if not professorname:
                professor = Professor.find(name_equal=NO_PROFESSOR_STRING)[0]
            else:
                professor = Professor(professorname)
                professor.store()
        else:
            professor = professor[0]
        return professor

    def checkschedule(self, sched):
        """Checks if the schedule object passed as sched is already in the
        bank and, if it isn't, stores a new schedule"""
        if not sched.idSchedule:
            sched.store()

    def generateoffer(self, classn_pract, prof_sched_info, professorname):
        """Creates a new Offer object with the informations about the class
        number and practical (classn_pract), the professor and the schedules
        (prof_sched_info), note that the professor that will be associated with
        this offer has its name in professorname"""
        professor = self.checkprofessor(professorname)
        offer = Offer(self.timeperiod, self.course, classn_pract[0],
                      classn_pract[1], professor)
        if not isinstance(prof_sched_info[1], list):
            prof_sched_info[1] = [prof_sched_info[1]]
        for sched in prof_sched_info[1]:
            self.checkschedule(sched)
        offer.setSchedules(prof_sched_info[1])
        return offer

    def createoffers(self, offersdata):
        "Returns a list of offers to treat"
        offerslist = []
        for offerdata in offersdata:
            classn_pract = getclassnunmberandpractical(offerdata[0])
            prof_sched = getprofessorandschedule(offerdata[1])
            if not prof_sched:
                continue
            prof_sched = organize(prof_sched)
            for prof_sched_info in prof_sched:
                for professorname in prof_sched_info[0]:
                    offer = self.generateoffer(classn_pract, prof_sched_info,
                                               professorname)
                    offerslist.append(offer)
        return offerslist

    def getrelevantdata(self):
        """Returns a list of tuple with tag objects, each one containing
        information of one offer related to a course. The first element
        of the tuple has the classnumber and practical information, the
        second element has the professor and schedule data"""
        tables = self.crawler.htmlpage.findAll('table')[1]
        tables = tables.findAll('table')[0]
        tables = tables.findAll('table')[5]
        tables = tables.findAll('table')
        for table in tables:
            if re.match('[\S\s]*Didáticas[\S\s]*', str(table)):
                tables.remove(table)
        # Some irrelevant data will appear with strides of size 3 in this list
        tables = removefromlistwithstridesize(tables, 2, 2)
        return group(tables, 2)
Example #20
    def __init__(self):
        db = get_session()
        truncate_db(db)

        config = ProjectConfig("A:/Development/magistrska/DependencyDiff/configs/fri.json", "A:/Development/magistrska/DependencyDiff/configs/fri.results.json")

        for old, new in zip(self.examples, self.examples[1::]):
            self.old_hash, self.old_url = old
            self.new_hash, self.new_url = new

            old_page = Page.get_or_create(db, self.project_name, self.old_hash, "https://www.fri.uni-lj.si/en/")
            old_page.url = self.old_url
            new_page = Page.get_or_create(db, self.project_name, self.new_hash, self.new_url)

            old_crawler = Crawler(old_page, "443")
            new_crawler = Crawler(new_page, "443")

            old_crawler.get_page(db, old_page, config, self.old_hash)
            new_crawler.get_page(db, new_page, config, self.new_hash)

            old_content = old_page.contents[0].content if len(old_page.contents) > 0 else ""
            new_content = new_page.contents[0].content if len(new_page.contents) > 0 else ""

            compare_result = Compare.compare(old_content, new_content)

            if compare_result:
                element_diff = Compare.extract_differences(compare_result)
                for element in element_diff:
                    old_diff = old_crawler.screenshot(element)
                    new_diff = new_crawler.screenshot(element)
                    DbDiff.get_or_create(db, old_page.id, new_page.id, element, old_diff[1], new_diff[1], old_diff[0],
                                         new_diff[0])

            for old_action in old_page.actions:
                for new_action in new_page.actions:
                    if old_action.element == new_action.element and old_action.type == new_action.type:
                        old_action_content = old_action.content
                        new_action_content = new_action.content

                        if old_content != old_action_content or new_content != new_action_content:

                            compare_result = Compare.compare(old_action_content, new_action_content)
                            if compare_result:
                                element_diff = Compare.extract_differences(compare_result)

                                if element_diff:
                                    old_crawler.visit_and_action(old_page, old_action, config)
                                    new_crawler.visit_and_action(new_page, new_action, config)
                                    for element in element_diff:
                                        old_diff = old_crawler.screenshot(element)
                                        new_diff = new_crawler.screenshot(element)
                                        DbDiff.get_or_create(db, old_page.id, new_page.id, element, old_diff[1],
                                                             new_diff[1], old_diff[0], new_diff[0], new_action.id)

            old_page.url = "https://www.fri.uni-lj.si/en/"
            new_page.url = "https://www.fri.uni-lj.si/en/"

            db.commit()
Example #21
if __name__ == '__main__':
    logger = Logger()
    logger.log(Logger.INFO, "Starting Drei")
    print("Starting Drei")

    # Initializing database with tables (if necessary)
    InitTables.main()

    # Initialize the message queue for crawler and manager
    manager_queue = multiprocessing.Queue()

    # Initialize the message queue for manager and webserver
    manager_webserver_queue = multiprocessing.Queue()

    # Start the crawler
    crawler = Crawler(manager_queue, platform.machine() != Const.PI_PLATFORM)
    crawler.start()

    # Start the manager
    manager = Manager(manager_queue, manager_webserver_queue)
    manager.start()

    # Start the webserver
    webserver = Webserver(manager_webserver_queue, manager_queue)
    webserver.start()

    # Wait until both processes end
    manager_queue.close()
    manager_queue.join_thread()
    crawler.join()
    manager.join()
Example #22
 def __init__(self):
     self.connection = None
     self.timeperiod = None
     self.faculty = None
     self.crawler = Crawler()
Example #23
 def __init__(self, base_page, crawl_depth, children):
     Crawler.__init__(self, base_page, crawl_depth, children)
     # super(self).__init__()
     # super.__init__(self, base_page, crawl_depth, children)
     self.scrapable_urls = []
Example #24
 def __init__(self, connection, cycle, timeperiod):
     self.connection = connection
     self.timeperiod = timeperiod
     self.cycle = cycle
     self.crawler = Crawler()
     self.course = None
Example #25
class OfferReader(object):
    """A reader object which will use a crawler to scan through the page
    describing the offers of a course"""

    def __init__(self, connection, cycle, timeperiod):
        self.connection = connection
        self.timeperiod = timeperiod
        self.cycle = cycle
        self.crawler = Crawler()
        self.course = None

    def setcourse(self, course):
        "Sets the course which will be scanned"
        if not isinstance(course, Course):
            raise Exception('Invalid course parameter')
        self.course = course

    def scancourseofferpage(self):
        "Scans the page of the course, adding new offers to the bank"
        urlstart = 'https://uspdigital.usp.br/jupiterweb/obterTurma'
        fullurl = appendparameters(urlstart, {'sgldis':
                                              self.course.courseCode})
        self.crawler.loadpage(fullurl)
        offers = []
        if re.match('[\S\s]*Hor&aacute;rio[\S\s]*',
                    str(self.crawler.htmlpage)):
            offersdata = self.getrelevantdata()
            offers.extend(self.createoffers(offersdata))
        return offers

    def checkprofessor(self, professorname):
        """Checks if the professor with the name passed as argument is already
        in the bank and, if it isn't, stores a new professor. After that,
        returns the Professor object. If the professor name is empty, there is
        a special entry in the bank for this case """
        professor = Professor.find(name_equal=professorname)
        if not professor:
            if not professorname:
                professor = Professor.find(name_equal=NO_PROFESSOR_STRING)[0]
            else:
                professor = Professor(professorname)
                professor.store()
        else:
            professor = professor[0]
        return professor

    def checkschedule(self, sched):
        """Checks if the schedule object passed as sched is already in the
        bank and, if it isn't, stores a new schedule"""
        if not sched.idSchedule:
            sched.store()

    def generateoffer(self, classn_pract, prof_sched_info, professorname):
        """Creates a new Offer object with the informations about the class
        number and practical (classn_pract), the professor and the schedules
        (prof_sched_info), note that the professor that will be associated with
        this offer has its name in professorname"""
        professor = self.checkprofessor(professorname)
        offer = Offer(self.timeperiod, self.course, classn_pract[0],
                      classn_pract[1],
                      professor)
        if not isinstance(prof_sched_info[1], list):
            prof_sched_info[1] = [prof_sched_info[1]]
        for sched in prof_sched_info[1]:
            self.checkschedule(sched)
        offer.setSchedules(prof_sched_info[1])
        return offer

    def createoffers(self, offersdata):
        "Returns a list of offers to treat"
        offerslist = []
        for offerdata in offersdata:
            classn_pract = getclassnunmberandpractical(offerdata[0])
            prof_sched = getprofessorandschedule(offerdata[1])
            if not prof_sched:
                continue
            prof_sched = organize(prof_sched)
            for prof_sched_info in prof_sched:
                for professorname in prof_sched_info[0]:
                    offer = self.generateoffer(classn_pract, prof_sched_info,
                                               professorname)
                    offerslist.append(offer)
        return offerslist

    def getrelevantdata(self):
        """Returns a list of tuple with tag objects, each one containing
        information of one offer related to a course. The first element
        of the tuple has the classnumber and practical information, the
        second element has the professor and schedule data"""
        tables = self.crawler.htmlpage.findAll('table')[1]
        tables = tables.findAll('table')[0]
        tables = tables.findAll('table')[5]
        tables = tables.findAll('table')
        for table in tables:
            if re.match('[\S\s]*Didáticas[\S\s]*', str(table)):
                tables.remove(table)
        # Some irrelevant data will appear with strides of size 3 in this list
        tables = removefromlistwithstridesize(tables, 2, 2)
        return group(tables, 2)
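A usage sketch for the OfferReader above; connection, cycle, timeperiod, and the Course instance are placeholders. setcourse() rejects anything that is not a Course, and scancourseofferpage() returns an empty list when the page has no schedule section:

offerreader = OfferReader(connection, cycle, timeperiod)  # placeholder objects
offerreader.setcourse(course)               # must be a Course instance
offers = offerreader.scancourseofferpage()  # [] when no 'Horário' section exists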
Example #26
 def __init__(self):
     self.connection = None
     self.timeperiod = None
     self.faculty = None
     self.crawler = Crawler()
Example #27
    def compare_pages(self):
        success = self.setup_project(self.old.hash)
        if not success:
            print(f"failed diff deploying {self.old.hash}")
            return
        success = self.setup_project(self.new.hash)
        if not success:
            print(f"failed diff deploying {self.new.hash}")
            return

        for old in self.old.pages:
            for new in self.new.pages:
                if new.url == old.url:
                    print("diff pages: ", new.url, old.url)
                    old_crawler = None
                    new_crawler = None
                    old_content = old.contents[0].content if len(
                        old.contents) > 0 else ""
                    new_content = new.contents[0].content if len(
                        new.contents) > 0 else ""

                    exists = PageDiff.exists(self.db, old.id, new.id)

                    if not exists:
                        compare_result = Compare.compare(
                            old_content, new_content)
                        if compare_result:
                            PageDiff.get_or_create(self.db, old.id, new.id,
                                                   compare_result)

                            element_diff = Compare.extract_differences(
                                compare_result)

                            if element_diff:
                                old_crawler = Crawler(
                                    Page.get_or_create(
                                        self.db,
                                        self.project_config.project_name,
                                        self.old.hash,
                                        Url.clean_url(Constants.DOCKER_URL)),
                                    self.projects[self.old.hash].port)
                                new_crawler = Crawler(
                                    Page.get_or_create(
                                        self.db,
                                        self.project_config.project_name,
                                        self.new.hash,
                                        Url.clean_url(Constants.DOCKER_URL)),
                                    self.projects[self.new.hash].port)

                                old_crawler.visit_page(old,
                                                       self.project_config)
                                new_crawler.visit_page(new,
                                                       self.project_config)

                                for element in element_diff:
                                    old_diff = old_crawler.screenshot(element)
                                    new_diff = new_crawler.screenshot(element)
                                    DbDiff.get_or_create(
                                        self.db, old.id, new.id, element,
                                        old_diff[1], new_diff[1], old_diff[0],
                                        new_diff[0])

                    if old_crawler is None:
                        old_crawler = Crawler(
                            Page.get_or_create(
                                self.db, self.project_config.project_name,
                                self.old.hash,
                                Url.clean_url(Constants.DOCKER_URL)),
                            self.projects[self.old.hash].port)
                    if new_crawler is None:
                        new_crawler = Crawler(
                            Page.get_or_create(
                                self.db, self.project_config.project_name,
                                self.new.hash,
                                Url.clean_url(Constants.DOCKER_URL)),
                            self.projects[self.new.hash].port)

                    visited = False
                    for old_action in self.windowed_query(
                            old.actions, Action.id, 1000):
                        for new_action in self.windowed_query(
                                new.actions, Action.id, 10):
                            if old_action.element == new_action.element and old_action.type == new_action.type:
                                old_action_content = old_action.content
                                new_action_content = new_action.content

                                if old_content != old_action_content or new_content != new_action_content:
                                    exists = DbDiff.exists(
                                        self.db, old.id, new.id, new_action.id)
                                    if not exists:
                                        compare_result = Compare.compare(
                                            old_action_content,
                                            new_action_content)
                                        if compare_result:
                                            element_diff = Compare.extract_differences(
                                                compare_result)

                                            if element_diff:
                                                old_crawler.visit_and_action(
                                                    old, old_action,
                                                    self.project_config)
                                                new_crawler.visit_and_action(
                                                    new, new_action,
                                                    self.project_config)
                                                for element in element_diff:
                                                    old_diff = old_crawler.screenshot(
                                                        element)
                                                    new_diff = new_crawler.screenshot(
                                                        element)
                                                    DbDiff.get_or_create(
                                                        self.db, old.id,
                                                        new.id, element,
                                                        old_diff[1],
                                                        new_diff[1],
                                                        old_diff[0],
                                                        new_diff[0],
                                                        new_action.id)

        return True
Example #28
 def testItCanCrawl(self):
     crawler = Crawler()
     crawler.add_to_crawl('http://google.se')
     crawler.crawl()
Example #29
class CycleReader(object):
    """A reader object which will use a crawler to scan through the page
    of a cycle"""
    def __init__(self):
        self.connection = MySQLConnection()
        self.timeperiod = None
        self.cycle = None
        self.coursereader = None
        self.offerreader = None
        self.crawler = Crawler()
        self.offerreader = OfferReader(self.connection, self.cycle,
                                       self.timeperiod)

    def settimeperiod(self, idtimeperiod):
        "Sets the timeperiod of this cycle by providing its id"
        self.timeperiod = idtimeperiod
        self.offerreader.timeperiod = TimePeriod.pickById(idtimeperiod)

    def setcycle(self, idcycle):
        "Sets the cycle of this reader by searching for the cycle in the bank"
        self.cycle = Cycle.pickById(idcycle)
        self.offerreader.cycle = self.cycle

    def startreadingcycles(self):
        """Starts scanning through the Cycle's page and iterates through each
        of its courses. This function will not read 'Ciclo Básico' or 'Grande
        Área'"""
        urlstart = 'https://uspdigital.usp.br/jupiterweb/listarGradeCurricular'
        codcg = self.findidfaculty()
        codcurlist = self.findcodcur()
        codhab = str(self.cycle.cycleCode)
        for codcur in codcurlist:
            parameters = {'codcg': str(codcg), 'codcur': str(codcur),
                          'codhab': codhab, 'tipo': 'N'}
            completeurl = appendparameters(urlstart, parameters)
            self.crawler.loadpage(completeurl)
            coursecodedata = self.crawler.htmlpage.findAll('table', {})
            coursecodedata = coursecodedata[1]  # The table with the courses
            codes = getcoursecodes(coursecodedata)
            if codes:
                break
        return self.startreadingoffers(codes)

    def findidfaculty(self):
        "returns the id of the faculty that has this cycle"
        relations = 'rel_courseCoordination_cycle r1, '\
            'rel_courseCoordination_faculty r2 '
        conditions = 'WHERE r1.idCourseCoordination = r2.idCourseCoordination'\
            ' AND r1.idCycle = ' + str(self.cycle.idCycle)
        query = 'SELECT idFaculty FROM ' + relations + conditions
        results = self.connection.execute(query)
        try:
            idfaculty = results[0][0]
        except IndexError:
            raise IndexError('Não conseguiu achar idFaculty,\
                             checar rel_courseCoordination_cycle e\
                             rel_courseCoordination_faculty com idCycle = ' +
                             str(self.cycle.idCycle))
            sys.exit()
        return idfaculty

    def findcodcur(self):
        """returns the idAcademicProgram representing the code for this
        object's cycle"""
        query = 'SELECT idAcademicProgram FROM rel_academicProgram_cycle '\
                'WHERE idCycle = ' + str(self.cycle.idCycle)
        codcurall = self.connection.execute(query)
        listcodcur = []
        for codcur in codcurall:
            listcodcur.append(codcur[0])
        return listcodcur

    def startreadingoffers(self, coursecodes):
        "Using the coursecodes list, reads all the offers from each code"
        requisitiontypetranslationdict = {0: 1, 1: 2, 2: 3}
        # In the bank: 1 - Obrigatória, 2 - Eletiva, 3 - Livre
        index = 0
        offers = []
        while index < len(coursecodes):
            for period in coursecodes[index]:
                for code in coursecodes[index][period]:
                    if code in COURSES_TO_IGNORE:
                        continue
                    reader = CourseReader(
                        code, self.connection, self.cycle, period,
                        requisitiontypetranslationdict[index])
                    course = reader.scancoursepage()
                    self.offerreader.setcourse(course)
                    offers.extend(self.offerreader.scancourseofferpage())
            index += 1
        return offers
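Finally, a usage sketch for the CycleReader above (both ids are placeholders): the time period and cycle are selected by id, then startreadingcycles() walks the curriculum pages and returns the offers it found:

cyclereader = CycleReader()
cyclereader.settimeperiod(3)               # placeholder idTimePeriod
cyclereader.setcycle(12)                   # placeholder idCycle
offers = cyclereader.startreadingcycles()  # list of Offer objects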