def crawl(self, directory=None, source=None, meta={}):
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    db.session.commit()
    source_id = source.id
    if os.path.isfile(directory):
        self.crawl_file(source_id, directory, meta)

    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        dirparts = [d for d in dirname.split(os.path.sep)
                    if d in SKIP_DIRECTORIES]
        if len(dirparts):
            continue
        log.info("Descending: %r", dirname)
        for file_name in files:
            dirname = string_value(dirname)
            file_name = string_value(file_name)
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            self.crawl_file(source_id, file_path, meta)
def source(self):
    if not hasattr(self, '_source'):
        self._source = Source.create({
            'foreign_id': self.SOURCE_ID,
            'label': self.SOURCE_LABEL or self.SOURCE_ID
        })
        db.session.commit()
    return self._source
def crawl_source(self, engine, foreign_id, data):
    source = Source.create({
        'foreign_id': foreign_id,
        'label': data.get('label')
    })
    db.session.commit()
    meta_base = data.get('meta', {})
    for name, query in data.get('queries', {}).items():
        self.crawl_query(engine, source, meta_base, name, query)
def source(self):
    if not hasattr(self, '_source'):
        self._source = Source.create({
            'foreign_id': self.SOURCE_ID,
            'label': self.SOURCE_LABEL or self.SOURCE_ID
        })
        db.session.commit()
    db.session.add(self._source)
    return self._source
def crawl(self):
    for base_url in SITES:
        print('Working on base_url: {}'.format(base_url))
        self.attributes = SITES[base_url]
        self.label = self.attributes['label']
        Source.create({
            'label': self.label,
            'foreign_id': 'blacklight'
        })
        db.session.commit()
        self.failed_articles = 0
        page_count = self.get_page_count(base_url)
        print("Pages: {}".format(page_count))
        page_number = 1
        while page_number <= page_count:
            if self.failed_articles >= FAILED_LIMIT:
                log.warning('Failure limit reached: {}'.format(FAILED_LIMIT))
                break
            self.crawl_page(base_url, page_number, page_count)
            page_number += 1
def crawl(self, directory=None, source=None):
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    if os.path.isfile(directory):
        meta = self.metadata()
        meta.file_name = directory
        self.emit_file(source, meta, directory)

    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        log.info("Descending: %r", dirname)
        for file_name in files:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            if not os.path.isfile(file_path):
                continue
            try:
                meta = self.metadata()
                if isinstance(file_name, six.text_type):
                    meta.source_path = file_path
                else:
                    enc = chardet.detect(file_name)
                    enc = enc.get('encoding')
                    try:
                        meta.source_path = file_path.decode(enc)
                    except:
                        meta.source_path = file_path.decode('ascii', 'ignore')
                self.emit_file(source, meta, file_path)
            except Exception as ex:
                log.exception(ex)
                process.exception(process.INDEX, component=self.name,
                                  source_location=directory,
                                  source_id=source.id, exception=ex)
def crawl_item(self, item, source):
    source_data = item.meta.get('source', {})
    source_fk = source_data.pop('foreign_id', source)
    if source_fk is None:
        raise ValueError("No foreign_id for source given: %r" % item)

    if source_fk not in self.sources:
        label = source_data.get('label', source_fk)
        self.sources[source_fk] = Source.create({
            'foreign_id': source_fk,
            'label': label
        })
        if source_data.get('public'):
            Permission.grant_foreign(self.sources[source_fk],
                                     Role.SYSTEM_GUEST, True, False)
        db.session.commit()

    log.info('Import: %r', item.identifier)
    meta = self.normalize_metadata(item)
    ingest_file(self.sources[source_fk].id, meta, item.data_path, move=False)
def crawl(self, directory=None, source=None):
    source = source or directory
    source = Source.create({
        'foreign_id': 'directory:%s' % slugify(source),
        'label': source
    })
    if os.path.isfile(directory):
        meta = self.metadata()
        meta.file_name = directory
        self.emit_file(source, meta, directory)

    directory = directory or os.getcwd()
    directory = directory.encode('utf-8')
    for (dirname, dirs, files) in os.walk(directory):
        log.info("Descending: %r", dirname)
        for file_name in files:
            if file_name in SKIP_FILES:
                continue
            file_path = os.path.join(dirname, file_name)
            if not os.path.isfile(file_path):
                continue
            try:
                meta = self.metadata()
                if isinstance(file_name, six.text_type):
                    meta.source_path = file_path
                else:
                    enc = chardet.detect(file_name)
                    enc = enc.get('encoding')
                    try:
                        meta.source_path = file_path.decode(enc)
                    except:
                        meta.source_path = file_path.decode('ascii', 'ignore')
                self.emit_file(source, meta, file_path)
            except Exception as ex:
                log.exception(ex)
def create():
    authz.require(authz.logged_in())
    src = Source.create(request_data(), current_user)
    db.session.commit()
    return view(src.slug)
def create_source(self, **data):
    if 'foreign_id' not in data:
        data['foreign_id'] = self.name
    return Source.create(data)
def create():
    authz.require(authz.is_admin())
    src = Source.create(request_data(), current_user)
    db.session.commit()
    return view(src.slug)