Example #1
    def open_spider(self, spider):
        # designate table and fields to populate
        if spider.name in ['techmeme']:
            self.table = 'du_agg_news'
            self.cols = ['title', 'link', 'blurb', 'src']
        elif spider.name in ['github']:
            self.table = 'du_agg_projects'
            self.cols = ['title', 'link', 'blurb', 'lang', 'updated', 'stars', 'forks', 'src']
        elif spider.name in ['coursera']:
            self.table = 'du_agg_courses'
            self.cols = ['title', 'link', 'blurb', 'school', 'school_link', 'course_date', 'course_length', 'src']
        elif spider.name in ['meetup']:
            self.table = 'du_agg_events'
            self.cols = ['title', 'link', 'blurb', 'host', 'location', 'event_date', 'event_time', 'src']
        else:
            utils.devlog('Cannot get database table for type %s' % spider.name, 'e')
            self.crawler.engine.close_spider(spider, 'Closed spider -- cannot get appropriate database table.')
            return  # no target table, so skip connecting to the database

        # connect to database
        try:
            urlparse.uses_netloc.append('postgres') # set parsing scheme
            url = urlparse.urlparse(os.environ['DATABASE_URL'])
            self.conn = psycopg2.connect("dbname=%s user=%s password=%s host=%s " % (url.path[1:], url.username, url.password, url.hostname))
            self.cur = self.conn.cursor()
        except KeyError, e:
            utils.devlog("No DATABASE_URL is set - cannot get database connection information.", 'e')
            self.crawler.engine.close_spider(spider, 'Closed spider -- cannot get database connection information.')
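The open_spider method above calls self.crawler.engine.close_spider, but a Scrapy item pipeline does not get a crawler attribute automatically. A minimal sketch of one way to provide it, using the standard from_crawler hook (the original pipeline class may wire this up differently):

    # Hedged sketch: give the pipeline a reference to the running crawler
    # so open_spider/process_item can call self.crawler.engine.close_spider.
    @classmethod
    def from_crawler(cls, crawler):
        pipeline = cls()
        pipeline.crawler = crawler
        return pipeline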
Example #2
    def process_item(self, item, spider):
        if item['link']:
            if item['link'] in self.pages_seen:
                utils.devlog("Link '%s' is a duplicate!" % item['link'], 'w')
                raise DropItem("Duplicate link found: %s" % item)
            else:
                self.pages_seen.add(item['link'])
        return item
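This duplicate filter assumes a pages_seen set already exists and that the pipeline is enabled for the project. A minimal sketch of both, with the class name DuplicatesPipeline and the module path aggregator.pipelines assumed purely for illustration:

    # Inside the (hypothetically named) DuplicatesPipeline class: create the set before the crawl starts.
    def open_spider(self, spider):
        self.pages_seen = set()

    # In settings.py -- enable the pipeline; depending on the Scrapy version ITEM_PIPELINES is
    # a dict mapping class path to priority (shown here) or a plain list of class paths:
    # ITEM_PIPELINES = {'aggregator.pipelines.DuplicatesPipeline': 100}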
Example #3
    def parseItem(self, response):
        utils.devlog("Beginning on new item...")

        hxs = HtmlXPathSelector(response)        
        item = EventItem()
        item['link'] = response.url
        item['title'] = hxs.select('//div[contains(@id, "event-title")]/@data-name').extract()[0]
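        # "BLAH"/"HAPPY" below are placeholder values left in while testing the XPath; real date extraction is not wired up yet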
        item['event_date'] = "BLAH" if not hxs.select('//div[contains(@id, "event-content")]//li[contains(@id, "event-when")]//time[contains(@id, "event-start-time")]/p[1]/text()').extract() else "HAPPY"
        #item['location'] = hxs.select('//div[contains(@id, "event-content")]//li[contains(@id, "event-where")]/@data-name').extract()[0]
        #item['location'] += hxs.select('//div[contains(@id, "event-content")]//li[contains(@id, "event-where")]/@data-address').extract()[0]
        #item['blurb'] += hxs.select('//div[contains(@id, "event-content")]//li[contains(@id, "event-desc")]//p/text()').extract()[0]
        #item['host'] = hxs.select('//div/a[contains(@class, "chapter-name")]/text()').extract()[0]
        item['src'] = "events"
        
        self.item_cnt += 1
        yield item
Example #4
    def parse(self, response):
        courses = json.loads(response.body) # grab the JSON from the API search
        for course in courses:
            if self.cats[0] in course['category-ids']:
                item = CourseItem()
                item['title'] = course['name']
                item['blurb'] = course['short_description']
                item['link'] = urlparse.urljoin("https://www.coursera.org/course/", course['short_name'])
                item['school'] = course['universities'][0]['name']
                item['school_link'] = urlparse.urljoin("http://coursera.org/", course['universities'][0]['short_name'])
                item['course_date'] = "TBA" if course['courses'][0]['start_date_string'] in [None, ""] else course['courses'][0]['start_date_string']
                item['course_length'] = "TBA" if course['courses'][0]['duration_string'] in [None, ""] else course['courses'][0]['duration_string']
                item['src'] = "courses"

                self.item_cnt += 1
                yield item

        utils.devlog("All done... total items is %d" % self.item_cnt)
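For reference, the fields read above imply that each element of the decoded courses list looks roughly like the dict below. The values are purely illustrative and the real API response carries many more fields; only the keys the spider actually touches are shown:

    # Illustrative shape of one course entry (made-up values; keys taken from the parse method above).
    course = {
        'name': 'Example Course',
        'short_name': 'examplecourse',
        'short_description': 'A short blurb describing the course.',
        'category-ids': [17],  # the spider keeps the course only if self.cats[0] appears in this list
        'universities': [{'name': 'Example University', 'short_name': 'exampleu'}],
        'courses': [{'start_date_string': 'Jan 1st 2015', 'duration_string': '6 weeks'}],
    }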
Example #5
    def parse(self, response):
        utils.devlog('Parsing page... depth is %s' % response.meta['depth'])
        
        # grab info from projects on this page
        #@@@ stars and forks do not work!
        hxs = HtmlXPathSelector(response)        
        for project in hxs.select('//li[contains(@class, "public")]'):
            item = ProjectItem()
            item['blurb'] = "" if not project.select('.//p[contains(@class, "description")]/text()').extract() else project.select('.//p[contains(@class, "description")]/text()').extract()[0]
            item['title'] = project.select('./h3/a/text()').extract()[0]
            item['link'] = urlparse.urljoin("https://github.com", project.select('./h3/a/@href').extract()[0])
            item['lang'] = project.select('./ul/li[1]/text()').extract()[0]
            item['updated'] = project.select('.//p[contains(@class, "updated-at")]/time/@title').extract()[0]
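            # note: the next line extracts the stargazers link's @href (a URL path), not a star count,
            # which is why the "stars and forks do not work" TODO above still stands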
            item['stars'] = project.select('.//li[contains(@class, "stargazers")]/a/@href').extract()[0]
            item['forks'] = project.select('.//li[contains(@class, "forks")]/a/text()').extract()[0]
            item['src'] = "projects"
            print "\nNUM STARS: %s" % item['stars']
            sys.exit(1)
            if item['stars'] == 0:
                print "NO STARS!!!!!!!! (%s)" % item['stars']
                sys.exit(1)                

            #print unicode(item['title']).encode('utf8')
            self.item_cnt += 1
            yield item

        # try to parse the next page
        try:
            nextPageLink = hxs.select('//div[contains(@class, "pagination")]/a[contains(@class, "next_page")]/@href').extract()[0]
            nextPageLink = urlparse.urljoin(response.url, nextPageLink)
            utils.devlog("Moving onto next page: link is %s" % nextPageLink)
            yield Request(nextPageLink, callback = self.parse)
        except IndexError:  # no next_page link found -- we are on the last page
            utils.devlog("I have reached the last page... total items is %d" % self.item_cnt)
Example #6
    def process_item(self, item, spider):                
        try:
            # attempt to get spider source
            self.cur.execute("select id from du_agg_sources where slug=%s;", (spider.name,)) 
            if not self.cur.rowcount:
                return item
            else:
                item['src'] = self.cur.fetchone()[0]         

            #@@@ attempt to get language ref if necessary
            if (spider.name == 'github') :
                item['stars'] = 0 if not item['stars'] else item['stars']
                item['forks'] = 0 if not item['forks'] else item['forks']
                self.cur.execute("select id from du_agg_languages where slug = %s or lower(title) = %s;", (item['lang'].lower(), item['lang'].lower())) 
                if not self.cur.rowcount:
                    self.bad += 1
                    return item
                else:                    
                    item['lang'] = self.cur.fetchone()[0]

            # populate dict with fields/vals
            keysvals = dict.fromkeys(self.cols)
            for col in keysvals:
                keysvals[col] = item[col]

            # generate SQL and insert into the DB
            datarep = ("%s," * len(keysvals)).rstrip(',')
            sql = "insert into %s (%s) values (%s);" % (self.table, ', '.join(keysvals.keys()), datarep)            
            self.cur.execute(sql, keysvals.values())
            #utils.devlog("QRY: %s" % self.cur.query)
            self.conn.commit() # maybe should commit in close_spider instead of for each item

        except psycopg2.DatabaseError, e:
            if self.conn:
                self.conn.rollback()
            self.bad += 1
            utils.devlog("Failed to store item entitled '%s' via %s spider: %s" % (item['title'], spider.name, e))
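To make the string building above concrete, here is a minimal sketch of the same steps run outside the pipeline, using the techmeme table and columns from Example #1 and made-up item values:

    # Sketch of the INSERT generation; table/column names come from open_spider, values are dummies.
    cols = ['title', 'link', 'blurb', 'src']
    keysvals = dict.fromkeys(cols)
    for col in keysvals:
        keysvals[col] = 'dummy-%s' % col
    datarep = ("%s," * len(keysvals)).rstrip(',')   # -> "%s,%s,%s,%s"
    sql = "insert into %s (%s) values (%s);" % ('du_agg_news', ', '.join(keysvals.keys()), datarep)
    # keysvals.keys() and keysvals.values() iterate in the same (arbitrary but consistent) order,
    # so the column list and the parameters passed to cur.execute(sql, keysvals.values()) stay aligned,
    # and psycopg2 handles quoting/escaping of each value.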
Example #7
    def close_spider(self, spider):
        if self.conn:
            self.conn.close()
        utils.devlog("Number of items not stored: %s" % self.bad)
Example #8
    def parse(self, response):                
        depth = response.meta['depth']
        utils.devlog('depth is %s' % depth)

        # when to quit!!
        if depth > 60:
            utils.devlog("I have crawled enough pages... total items is %d" % self.item_cnt)
            return

        # generate a cutoff date so we don't search for meetups in the distant future
        now = datetime.now()
        cutoff = now + timedelta(weeks=12)
        utils.devlog("The cutoff date is %s" % cutoff.strftime("%B %d, %Y"))

        # grab info from events on this page
        hxs = HtmlXPathSelector(response)        
        for event in hxs.select('//ul[contains(@class, "event-listing-container")]/li[contains(@class, "event-listing")]'):
            # stop at cutoff date so we don't get too many
            date = datetime(int(event.select('@data-year').extract()[0]), int(event.select('@data-month').extract()[0]), int(event.select('@data-day').extract()[0]))
            if date > cutoff:
                utils.devlog("I have reached the cutoff date... total items is %d" % self.item_cnt)
                return

            # not yet in use - pages are not standardized enough!            
            #link = event.select('./a[contains(@class, "list-time")]/@href').extract()[0]
            #yield Request(link, callback = self.parseItem)
            item = EventItem()            
            item['link'] = event.select('./a[contains(@class, "list-time")]/@href').extract()[0]
            item['event_date'] = date.strftime("%B %d, %Y")
            item['event_time'] = event.select('./a[contains(@class, "list-time")]/text()').extract()[0] 
            item['host'] = event.select('./div/a[contains(@class, "chapter-name")]/text()').extract()[0]
            item['title'] = event.select('./div/h4/a[contains(@class, "event-title")]/text()').extract()[0]
            item['blurb'] = ""                                
            item['location'] = "TBA"
            item['src'] = "events"
            self.item_cnt += 1
            yield item
            
        # try to parse the next page
        # There is a bug on the meetup site such that the Next link in the HTML is wrong.
        # Therefore I am generating the crawl link manually.
        try:
            p = re.compile('currentpage=([0-9]+)')
            currentPage = int(p.search(response.url).group(1))
            nextPage = currentPage + 1
            offset = currentPage * 64
            nextPageLink = "http://www.meetup.com/find/?offset=%s&psize=64&currentpage=%s&categories=34&radius=10&userFreeform=San+Francisco&events=true&sort=default" % (offset, nextPage)
                        
            #nextPageLink = hxs.select('//div[contains(@class, "simple-infinite-pager")]/a/@href').extract()[0]
            #nextPageLink = urlparse.urljoin(response.url, nextPageLink)
            utils.devlog("Moving onto next page: offset is %s and nextpage is %s which yields link %s" % (offset, nextPage, nextPageLink))
            yield Request(nextPageLink, callback = self.parse)
        except AttributeError:  # the currentpage regex did not match, so no next-page link can be built
            utils.devlog("Failed to fetch next page to crawl... total items is %d" % self.item_cnt)
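One detail worth noting: the regex above expects a currentpage parameter to already be present in response.url, and response.meta['depth'] is filled in by Scrapy's built-in depth middleware. A hypothetical start URL mirroring the generated nextPageLink format (the real spider may construct it differently) would satisfy the regex on the first call to parse:

    # Hypothetical start_urls entry (page 1, offset 0) matching the currentpage regex used in parse.
    start_urls = [
        "http://www.meetup.com/find/?offset=0&psize=64&currentpage=1&categories=34"
        "&radius=10&userFreeform=San+Francisco&events=true&sort=default"
    ]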