def load_characters(neighbours, blur_scale, verbose=0):
    """Load cached Character objects, or build them from the image folder.

    Reads one subfolder per character class under IMAGES_FOLDER, normalizes
    each sample image and computes its single-cell LBP feature vector, then
    caches the resulting list on disk so subsequent calls are fast.

    Args:
        neighbours: LBP neighbour count used for the feature vectors.
        blur_scale: Gaussian blur scale used when normalizing images.
        verbose: when truthy, print progress information.

    Returns:
        List of Character objects with feature vectors populated.
    """
    chars_file = 'characters_%s_%s.dat' % (blur_scale, neighbours)

    if exists(chars_file):
        # Cache hit: deserialize the previously generated characters.
        if verbose:
            print('Loading characters...')
        chars = fload(chars_file)
    else:
        if verbose:
            print('Going to generate character objects...')
        chars = []

        # One subfolder per character class; each file in it is one sample.
        for char in sorted(listdir(IMAGES_FOLDER)):
            count = 0

            for image in sorted(listdir(IMAGES_FOLDER + char)):
                image = GrayscaleImage(IMAGES_FOLDER + char + '/' + image)
                norm = NormalizedCharacterImage(image, blur=blur_scale,
                        height=NORMALIZED_HEIGHT)
                character = Character(char, [], norm)
                character.get_single_cell_feature_vector(neighbours)
                chars.append(character)
                count += 1

            if verbose:
                print('Loaded character %s %d times' % (char, count))

        if verbose:
            print('Saving characters...')

        # Persist so the expensive generation step is skipped next time.
        fdump(chars, chars_file)

    return chars
def load_classifier(neighbours, blur_scale, c=None, gamma=None, verbose=0):
    """Load a saved classifier, or train and save a new one.

    Args:
        neighbours: LBP neighbour count the classifier was/will be built for.
        blur_scale: blur scale the learning set was/will be generated with.
        c: SVM soft-margin parameter (required when training a new model).
        gamma: SVM kernel gamma (required when training a new model).
        verbose: when truthy, print progress information.

    Returns:
        A trained Classifier instance.

    Raises:
        ValueError: if no saved classifier exists and c or gamma is missing.
    """
    classifier_file = 'classifier_%s_%s.dat' \
            % (blur_scale, neighbours)
    classifier_path = DATA_FOLDER + classifier_file

    # BUG FIX: the existence check previously tested the bare filename while
    # loading and saving both use the DATA_FOLDER-prefixed path, so a saved
    # classifier was never found on reload. Check the actual path instead.
    if exists(classifier_path):
        if verbose:
            print('Loading classifier...')
        classifier = Classifier(filename=classifier_path,
                neighbours=neighbours, verbose=verbose)
    elif c is not None and gamma is not None:
        if verbose:
            print('Training new classifier...')
        classifier = Classifier(c=c, gamma=gamma, neighbours=neighbours,
                verbose=verbose)
        learning_set = load_learning_set(neighbours, blur_scale,
                verbose=verbose)
        classifier.train(learning_set)
        classifier.save(classifier_path)
    else:
        # ValueError is a subclass of Exception, so existing broad handlers
        # still catch this.
        raise ValueError('No soft margin and gamma specified.')

    return classifier
def load_test_set(neighbours, blur_scale, verbose=0):
    """Load the cached test set, or generate it via generate_sets().

    Args:
        neighbours: LBP neighbour count used for the feature vectors.
        blur_scale: blur scale used when generating the sets.
        verbose: when truthy, print progress information.

    Returns:
        The test set (second element of the generate_sets() result).
    """
    test_set_file = 'test_set_%s_%s.dat' % (blur_scale, neighbours)

    if exists(test_set_file):
        if verbose:
            print('Loading test set...')

        test_set = fload(test_set_file)

        if verbose:
            print('Test set: %s' % [c.value for c in test_set])
    else:
        # No cache yet: generate_sets() returns (learning_set, test_set).
        test_set = generate_sets(neighbours, blur_scale, verbose=verbose)[1]

    return test_set
def load_learning_set(neighbours, blur_scale, verbose=0):
    """Load the cached learning set, or generate it via generate_sets().

    Args:
        neighbours: LBP neighbour count used for the feature vectors.
        blur_scale: blur scale used when generating the sets.
        verbose: when truthy, print progress information.

    Returns:
        The learning set (first element of the generate_sets() result).
    """
    learning_set_file = 'learning_set_%s_%s.dat' % (blur_scale, neighbours)

    if exists(learning_set_file):
        if verbose:
            print('Loading learning set...')

        learning_set = fload(learning_set_file)

        if verbose:
            print('Learning set: %s' % [c.value for c in learning_set])
    else:
        # No cache yet: generate_sets() returns (learning_set, test_set).
        learning_set = generate_sets(neighbours, blur_scale,
                verbose=verbose)[0]

    return learning_set
import data import sys start_urls = "http://www.azlyrics.com/lyrics/shakira/empire.html" if (data.exists(start_urls)): sys.exit(data.exists(start_urls)) else: from scrapy.spider import Spider from scrapy.selector import Selector from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor from scrapy.selector import HtmlXPathSelector from lyrics_az.items import LyricsAzItem class LyricsSpiderClass(Spider): name = "lyrics" allowed_domains = ["www.azlyrics.com"] start_urls = [ "http://www.azlyrics.com/lyrics/shakira/empire.html", ] def parse(self, response): sel = Selector(response) sites = sel.xpath('//div[@id="main"]') items = [] for site in sites: item = {} item['lyrics'] = "".join(site.xpath('div/text()').extract()) item['Artist'] = "".join(site.xpath('h2/text()').extract()) item['Song_name'] = "".join(site.xpath('b/text()').extract()) if (len(item['lyrics']) != 0):
import data
import sys

# URL that doubles as the cache key for previously scraped results.
start_urls = "http://www.goodreads.com/quotes"

if data.exists(start_urls):
    # Already scraped: exit immediately, passing the cached result through.
    # NOTE(review): sys.exit with a non-int argument prints it and exits
    # with status 1 — presumably intentional here; confirm against callers.
    sys.exit(data.exists(start_urls))
else:
    from scrapy.spider import Spider
    from scrapy.selector import Selector
    from scrapy.contrib.linkextractors.sgml import SgmlLinkExtractor
    from scrapy.selector import HtmlXPathSelector
    from scraping_goodreads.items import ScrapingGoodreadsItem

    class ScrapingGoodreadsSpider(Spider):
        """Spider that scrapes quote text, author, and source work from
        goodreads.com/quotes and persists them via data.saveQuotes()."""

        name = "goodreads"
        allowed_domains = ["www.goodreads.com"]
        # BUG FIX: start_urls must be a list — Scrapy iterates it, and a
        # bare string would be consumed one character at a time.
        start_urls = ["http://www.goodreads.com/quotes"]

        def parse(self, response):
            """Extract every quote block from the page and save the batch.

            Args:
                response: the Scrapy HTTP response for a start URL.
            """
            sel = Selector(response)
            # NOTE(review): '//div' matches every div on the page; the
            # empty-body filter below discards the non-quote ones.
            sites = sel.xpath('//div')
            items = []

            for site in sites:
                item = {}
                # Removed redundant str() around .encode('UTF-8'):
                # encode already yields a byte string under Python 2.
                item['body'] = "".join(site.xpath(
                        'div[@class="quoteText"]/text()').extract()).encode('UTF-8')
                item['author'] = "".join(site.xpath(
                        'div[@class="quoteText"]/a/text()').extract()).encode('UTF-8')
                item['work'] = "".join(site.xpath(
                        'div[@class="quoteText"]/i/a/text()').extract()).encode('UTF-8')

                if item['body']:
                    items.append(item)

            # Pass the module-level URL string (not the class-level list) so
            # the value stored by saveQuotes is unchanged.
            data.saveQuotes(start_urls, self.allowed_domains, items)
            return 0