def rename(docs, target=None, **kwargs):
    assert target, "To run this migration you need a target."

    for doc in docs:
        # Copy the raw attributes and swap in the new source name.
        new_doc = copy.deepcopy(doc.raw.attributes)
        new_doc['source'] = target
        raw = RawDocument(new_doc, validate=False)

        assert doc.raw.attributes['source'] != target, "Can't rename {} to {}, names are the same.".format(doc.raw['source'], target)

        if not kwargs.get('dry'):
            # Re-run the pipeline under the new source name.
            tasks.process_raw(raw)
            tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
            logger.info('Processed document from {} with id {}'.format(doc.raw.attributes['source'], raw['docID']))

            # Drop the old entries from both Elasticsearch indices; ignore=[404]
            # keeps already-missing documents from raising.
            es_processor = get_processor('elasticsearch')
            es_processor.manager.es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404])
            es_processor.manager.es.delete(index='share_v1', doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404])

            logger.info('Renamed document from {} to {} with id {}'.format(doc.raw.attributes['source'], target, raw['docID']))

def rename(docs, target=None, **kwargs): assert target, "To run this migration you need a target." for doc in docs: raw = RawDocument({ 'doc': doc.doc, 'docID': doc.docID, 'source': target, 'filetype': doc.filetype, 'timestamps': doc.timestamps, 'versions': doc.versions }) assert doc.source != target, "Can't rename {} to {}, names are the same.".format( doc.source, target) if not kwargs.get('dry'): tasks.process_raw(raw) tasks.process_normalized(tasks.normalize(raw, raw['source']), raw) logger.info('Processed document from {} with id {}'.format( doc.source, raw['docID'])) es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.source, id=raw['docID'], ignore=[404]) es.delete(index='share_v1', doc_type=doc.source, id=raw['docID'], ignore=[404]) logger.info('Deleted document from {} with id {}'.format( doc.source, raw['docID']))
def rename(source, target, dry=True):
    assert source != target, "Can't rename {} to {}, names are the same".format(source, target)

    count = 0
    exceptions = []

    for doc in documents(source):
        count += 1
        try:
            raw = RawDocument({
                'doc': doc.doc,
                'docID': doc.docID,
                'source': target,
                'filetype': doc.filetype,
                'timestamps': doc.timestamps,
                'versions': doc.versions
            })

            if not dry:
                process_raw(raw)
                process_normalized(normalize(raw, raw['source']), raw)
                logger.info('Processed document from {} with id {}'.format(source, raw['docID']))
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)
        else:
            # Only delete the old Elasticsearch entries once re-processing succeeded.
            if not dry:
                es.delete(index=settings.ELASTIC_INDEX, doc_type=source, id=raw['docID'], ignore=[404])
                es.delete(index='share_v1', doc_type=source, id=raw['docID'], ignore=[404])
                logger.info('Deleted document from {} with id {}'.format(source, raw['docID']))

    if dry:
        logger.info('Dry run complete')

    for ex in exceptions:
        logger.exception(ex)

    logger.info('{} documents processed, with {} exceptions'.format(count, len(exceptions)))

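# Usage sketch, not part of the original snippet: a dry run reports counts and
# exceptions without touching anything; the same call with dry=False performs
# the migration. The source names below are hypothetical placeholders.
if __name__ == '__main__':
    rename('old_source_name', 'new_source_name', dry=True)
    # rename('old_source_name', 'new_source_name', dry=False)  # the real migration
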
def process_one(harvester_name, harvester, raw_path):
    date = parser.parse(raw_path.split('/')[-2])
    timestamp = date.isoformat()

    raw_file = store.get_as_string(raw_path)

    raw_doc = RawDocument({
        'doc': raw_file,
        'timestamps': {
            'harvestFinished': timestamp
        },
        'docID': b64decode(raw_path.split('/')[-3]).decode('utf-8'),
        'source': harvester_name,
        'filetype': harvester['fileFormat'],
    })

    try:
        # Reuse a previously normalized document stored next to the raw file, if any.
        raw_list = raw_path.split('/')
        raw_list[-1] = 'normalized.json'
        normalized_path = '/'.join(raw_list)
        with open(normalized_path, 'r') as f:
            normalized = NormalizedDocument(json.load(f))
    except Exception:
        normalized = normalize(raw_doc, harvester_name)

    # Chain the Cassandra write, the Elasticsearch write, and the backup move
    # into a single asynchronous Celery workflow.
    (
        process_to_cassandra.si(raw_doc, normalized) |
        process_to_elasticsearch.si(raw_doc, normalized) |
        move_to_backup.si(raw_path)
    ).apply_async()

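# Path layout assumed by process_one(), inferred from the split() indexing above
# (the storage prefix itself is an assumption): the second-to-last segment is the
# harvest date and the third-to-last is the base64-encoded docID, e.g.
#
#   raw_path = 'archive/pubmed/ZG9jLTQy/2015-03-17T00:00:00/raw.xml'
#   raw_path.split('/')[-2]  # -> '2015-03-17T00:00:00', parsed into the timestamp
#   raw_path.split('/')[-3]  # -> 'ZG9jLTQy'; b64decode() gives 'doc-42'
#
# Replacing the last segment with 'normalized.json' points at any previously
# normalized document stored alongside the raw file.
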
def renormalize(doc, **kwargs):
    raw = RawDocument({
        'doc': doc.doc,
        'docID': doc.docID,
        'source': doc.source,
        'filetype': doc.filetype,
        'timestamps': doc.timestamps,
        'versions': doc.versions
    })

    if not kwargs.get('dry'):
        tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)

def rename(docs, target=None, **kwargs): assert target, "To run this migration you need a target." for doc in docs: new_doc = copy.deepcopy(doc.raw.attributes) new_doc['source'] = target raw = RawDocument(new_doc, validate=False) assert doc.raw.attributes['source'] != target, "Can't rename {} to {}, names are the same.".format(doc.raw['source'], target) if not kwargs.get('dry'): tasks.process_raw(raw) tasks.process_normalized(tasks.normalize(raw, raw['source']), raw) logger.info('Processed document from {} with id {}'.format(doc.raw.attributes['source'], raw['docID'])) es_processor = get_processor('elasticsearch') es_processor.manager.es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404]) es_processor.manager.es.delete(index='share_v1', doc_type=doc.raw.attributes['source'], id=raw['docID'], ignore=[404]) logger.info('Renamed document from {} to {} with id {}'.format(doc.raw.attributes['source'], target, raw['docID']))
def renormalize(sources=()):
    count = 0
    exceptions = []

    for doc in documents(*sources):
        count += 1
        try:
            raw = RawDocument({
                'doc': doc.doc,
                'docID': doc.docID,
                'source': doc.source,
                'filetype': doc.filetype,
                'timestamps': doc.timestamps,
                'versions': doc.versions
            })
            process_normalized(normalize(raw, raw['source']), raw)
        except Exception as e:
            logger.exception(e)
            exceptions.append(e)

    for ex in exceptions:
        logger.exception(ex)

    logger.info('{} documents processed, with {} exceptions'.format(count, len(exceptions)))

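# Usage sketch, not from the original snippet: pass specific source names to
# renormalize only those harvesters, or call with no arguments to renormalize
# every document that documents() yields. The source names are placeholders.
#
#     renormalize(sources=('pubmed', 'arxiv_oai'))
#     renormalize()
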
def rename(doc, target=None, **kwargs):
    assert target, "To run this migration you need a target."

    raw = RawDocument({
        'doc': doc.doc,
        'docID': doc.docID,
        'source': target,
        'filetype': doc.filetype,
        'timestamps': doc.timestamps,
        'versions': doc.versions
    })

    assert doc.source != target, "Can't rename {} to {}, names are the same.".format(doc.source, target)

    if not kwargs.get('dry'):
        tasks.process_raw(raw)
        tasks.process_normalized(tasks.normalize(raw, raw['source']), raw)
        logger.info('Processed document from {} with id {}'.format(doc.source, raw['docID']))

        es.delete(index=settings.ELASTIC_INDEX, doc_type=doc.source, id=raw['docID'], ignore=[404])
        es.delete(index='share_v1', doc_type=doc.source, id=raw['docID'], ignore=[404])
        logger.info('Deleted document from {} with id {}'.format(doc.source, raw['docID']))

def main():
    for raw in document_generator():
        try:
            process_normalized(normalize(raw, raw['source']), raw)
        except Exception as e:
            logger.exception(e)

def renormalize(docs, *args, **kwargs):
    for doc in docs:
        if not kwargs.get('dry'):
            tasks.process_normalized(tasks.normalize(doc.raw, doc.raw['source']), doc.raw)