def headers(self, headers):
    """Replace the stored headers with a normalised copy of ``headers``.

    The stored mapping is always reset first. When ``headers`` is not a
    ``Mapping`` nothing else happens and the stored mapping stays empty.
    Otherwise each key is passed through ``slugify(..., sep='_')`` and each
    value through ``string_value`` before being stored.
    """
    self._headers = {}
    if isinstance(headers, Mapping):
        store = self._headers
        for raw_name, raw_value in headers.items():
            slug = slugify(raw_name, sep='_')
            store[slug] = string_value(raw_value)
def on_text(self, text):
    """Collect named entities found in ``text``.

    Texts that are ``None`` or at most 100 characters long are ignored.
    Otherwise the text is wrapped in ``Text`` (with a language hint when
    ``self.meta.languages`` holds exactly one entry) and its ``entities``
    are scanned. For every entity kept, its schema is appended to
    ``self.entity_schemata`` and its name stored in ``self.entity_names``,
    both keyed by ``'<origin>:<slug of name>'``.

    Failures are logged and swallowed: this is a best-effort step.
    """
    if text is None:
        return
    if len(text) <= 100:
        return
    try:
        hint = None
        if len(self.meta.languages) == 1:
            hint = self.meta.languages[0]
        parsed = Text(text, hint_language_code=hint)
        for entity in parsed.entities:
            # Entities tagged 'I-LOC' are not recorded.
            if entity.tag == 'I-LOC':
                continue
            # Keep only tokens that contain cased characters; tokens whose
            # lower- and upper-case forms coincide (digits, punctuation)
            # are dropped.
            words = [tok for tok in entity if tok.lower() != tok.upper()]
            if len(words) < 2:
                continue
            name = ' '.join(words)
            if not 5 <= len(name) <= 150:
                continue
            schema = SCHEMAS.get(entity.tag, DEFAULT_SCHEMA)
            fk = '%s:%s' % (self.origin, slugify(name))
            self.entity_schemata[fk].append(schema)
            self.entity_names[fk] = name
    except ValueError as ve:
        log.info('NER value error: %r', ve)
    except Exception as ex:
        log.warning('NER failed: %r', ex)
def make_filename(file_name, sep='-'):
    """Build a slugified file name from ``file_name``.

    Only the base name of the path is used. It is split once on the last
    ``'.'`` and each piece is slugified with ``sep`` and cut to 200
    characters; pieces that slugify to ``None`` are dropped. The pieces are
    re-joined with ``'.'`` and leading/trailing dots and ``sep`` characters
    are stripped.

    Returns ``None`` when ``file_name`` is ``None`` or nothing is left
    after cleaning.
    """
    if file_name is None:
        return file_name
    base = os.path.basename(six.text_type(file_name))
    pieces = []
    for part in base.rsplit('.', 1):
        slug = slugify(part, sep=sep)
        if slug is not None:
            pieces.append(slug[:200])
    cleaned = '.'.join(pieces).strip('.').strip(sep)
    cleaned = six.text_type(cleaned)
    if len(cleaned.strip()) == 0:
        return None
    return cleaned
def add_column(self, label):
    """Register a new column for ``label`` and return its wrapper.

    The column name is the slug of the label (``'column'`` when the slug is
    empty), cut to 55 characters and made unique among the names of
    ``self.columns``. The resulting ``{'label', 'name'}`` dict is appended
    to ``self.schema['columns']`` and wrapped in a ``TabularColumn``.
    """
    label = string_value(label)
    base = slugify(label or '', sep='_') or 'column'
    base = base[:55]

    # de-dupe: column, column_2, column_3, ...
    # The suffix is always applied to the base slug (not to the previous
    # candidate), and the unique name is what gets stored in the schema.
    taken = {c.name for c in self.columns}
    name, i = base, 2
    while name in taken:
        name = '%s_%s' % (base, i)
        i += 1

    column = {'label': label, 'name': name}
    self.schema['columns'].append(column)
    return TabularColumn(self, column)
def crawl(self, directory=None, foreign_id=None, meta=None):
    """Ingest ``directory`` into a managed collection.

    Args:
        directory: Path to crawl. Logged as invalid (and nothing is done)
            when it is ``None`` or does not exist.
        foreign_id: Identifier of the target collection. When omitted it is
            derived from the slug of the absolute directory path.
        meta: Metadata passed to ``self.make_meta``. Defaults to an empty
            dict, created per call rather than shared between calls.
    """
    directory = string_value(directory)
    if directory is None or not os.path.exists(directory):
        log.error("Invalid directory: %r", directory)
        return

    directory = os.path.abspath(os.path.normpath(directory))
    if foreign_id is None:
        foreign_id = 'directory:%s' % slugify(directory)

    # The collection is always loaded, whether the foreign id was supplied
    # or derived, so ``collection.id`` below is never read from ``None``.
    collection = self.load_collection({
        'foreign_id': foreign_id,
        'label': directory,
        'managed': True
    })
    db.session.commit()

    meta = self.make_meta({} if meta is None else meta)
    meta.source_path = directory
    ingest_directory(collection.id, meta, directory)
def headers(self):
    """Return the stored headers with slugified names.

    Reads ``self._headers`` (treated as empty when falsy) and returns a new
    dict whose keys are ``slugify(key, sep='_')``; values are unchanged.
    """
    normalized = {}
    for key, value in (self._headers or {}).items():
        normalized[slugify(key, sep='_')] = value
    return normalized
def headers(self):
    """Return the headers from ``self.meta`` with slugified names.

    A missing ``'headers'`` entry and a falsy one (e.g. ``None``) are both
    treated as "no headers", so an empty dict is returned instead of
    failing on ``.items()``. Keys are ``slugify(key, sep='_')``; values are
    unchanged.
    """
    raw = self.meta.get('headers') or {}
    return {slugify(k, sep='_'): v for k, v in raw.items()}