def load_all(self, kwargs_key):
    """
    Do the work.
    """
    start = time.time()
    try:
        # create a session specific to this task
        session = gen_session()

        # get the inputs from redis
        kwargs = self.redis.get(kwargs_key)
        if not kwargs:
            raise InternalServerError(
                'An unexpected error occurred while processing bulk upload.')

        kwargs = pickle_to_obj(kwargs)
        data = kwargs.get('data')
        kw = kwargs.get('kw')

        # delete them
        self.redis.delete(kwargs_key)

        outputs = []
        errors = []

        fx = partial(self._load_one, **kw)

        if self.concurrent:
            pool = Pool(min([len(data), self.max_workers]))
            for res in pool.imap_unordered(fx, data):
                if isinstance(res, Exception):
                    errors.append(res)
                else:
                    outputs.append(res)
        else:
            for item in data:
                res = fx(item)
                if isinstance(res, Exception):
                    errors.append(res)
                else:
                    outputs.append(res)

        # return errors
        if len(errors):
            self._handle_errors(errors)

        # add objects and execute
        if self.returns == 'model':
            for o in outputs:
                if o is not None:
                    try:
                        session.add(o)
                        session.commit()
                    except Exception as e:
                        self._handle_errors(e)

        # union all queries
        elif self.returns == 'query':
            for query in outputs:
                if query is not None:
                    try:
                        session.execute(query)
                    except Exception as e:
                        self._handle_errors(e)
            try:
                session.commit()
            except Exception as e:
                session.rollback()
                session.remove()
                self._handle_errors(e)

        # return true if everything worked.
        session.close()
        return True

    except JobTimeoutException:
        end = time.time()
        return InternalServerError(
            'Bulk loading timed out after {} seconds'
            .format(end - start))
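# Hypothetical sketch (an assumption, not shown in this section): load_all()
# expects the value stored under `kwargs_key` to unpickle into a dict with a
# 'data' key (an iterable of rows passed one-by-one to self._load_one) and a
# 'kw' key (keyword arguments applied to every row). A producer might do
# something like the following, where `obj_to_pickle` is assumed to be the
# serializer complementary to pickle_to_obj:
#
# self.redis.set(kwargs_key, obj_to_pickle({'data': rows, 'kw': {'org_id': 1}}))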
def set_session(self):
    if hasattr(self, 'session'):
        self.session.close()
    self.session = gen_session()
def ingest(obj, org_id, url_fields=['body'],
           requires=['url', 'type'], extract=True,
           kill_session=True):
    """
    Ingest a Content Item.
    """
    # distinct session for this eventlet.
    session = gen_session()

    # check required fields
    ingest_util.check_requires(obj, requires, type='Content Item')

    # validate type
    validate_content_item_types(obj['type'])

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # run article extraction.
    if extract:
        cache_response = extract_cache.get(url=obj['url'], type=obj['type'])
        if not cache_response:
            # make sure to kill this key.
            extract_cache.invalidate(url=obj['url'], type=obj['type'])
            raise RequestError(
                'Extraction failed on {type} - {url}'.format(**obj))

        # extraction succeeded
        else:
            data = cache_response.value
            obj.update(data)

    else:
        obj['title'] = ingest_util.prepare_str(obj, 'title')
        obj['description'] = ingest_util.prepare_str(obj, 'description')
        obj['body'] = ingest_util.prepare_str(obj, 'body')
        obj['created'] = ingest_util.prepare_str(obj, 'created')
        if not obj['created']:
            obj.pop('created')

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tags_ids + authors + links
    tag_ids = obj.pop('tag_ids', [])
    authors = obj.pop('author_ids', [])
    authors.extend(obj.pop('authors', []))  # accept names too
    # links = obj.pop('links', {})

    # determine content item provenance
    obj = _content_item_provenance(obj, org_id)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(ContentItem))

    # see if the content item already exists.
    c = session.query(ContentItem)\
        .filter_by(org_id=org_id, type=obj['type'], url=obj['url'])\
        .first()

    # if not, create it
    if not c:
        c = ContentItem(org_id=org_id, **obj)

    # else, update it
    else:
        for k, v in obj.items():
            setattr(c, k, v)

    # extract urls and normalize urls asynchronously.
    # urls = ingest_util.extract_urls(
    #     obj,
    #     url_fields,
    #     source=data.get('url'),
    #     links=_links)

    # detect content_items
    # if len(_links):
    #     c = _associate_content_items(c, org_id, _links)

    # associate tags
    if len(tag_ids):
        c = _associate_tags(c, org_id, tag_ids, session)

    # associate authors
    if len(authors):
        _authors = _associate_authors(c, org_id, authors, session)
        for a in _authors:
            if a.id not in c.author_ids:
                c.authors.append(a)

    session.add(c)
    session.commit()
    if kill_session:
        session.close()
    return c
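# Hypothetical usage sketch (not part of the source): a minimal call to the
# content-item ingest() above, derived from its signature. Field values and
# org_id are illustrative assumptions; only 'url' and 'type' are required per
# `requires`, and extract=False skips the article-extraction cache.
#
# item = {
#     'url': 'http://example.com/some-article',
#     'type': 'article',
#     'title': 'An example article',
# }
# content_item = ingest(item, org_id=1, extract=False)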
import unittest

from newslynx.exc import SousChefSchemaError
from newslynx.models import sous_chef_schema, SousChef
from newslynx.core import gen_session

db_session = gen_session()


class TestSousChefJSONSchema(unittest.TestCase):

    def test_good_schema(self):
        sc = {
            "name": "Twitter List",
            "slug": "twitter-list",
            "description": "Extracts events from a twitter list.",
            "runs": "newslynx.sc.events.twitter.List",
            "creates": "events",
            "options": {
                "owner_screen_name": {
                    "input_type": "text",
                    "value_types": ["string"],
                    "accepts_list": True,
                    "required": True,
                    "help": {
                        "placeholder": "cspan"
                    },
                },
                "min_followers": {
                    "input_type": "number",
                    "value_types": ["numeric"],
def set_session(self):
    self.session = gen_session()
def ingest(obj, org_id, org_domains,
           url_fields=['title', 'body', 'description'],
           requires=['title'], must_link=False,
           kill_session=True):
    """
    Ingest an Event.
    """
    # distinct session for this eventlet.
    session = gen_session()

    has_content_items = False

    # check required fields
    ingest_util.check_requires(obj, requires, type='Event')

    # validate status
    if 'status' in obj:
        validate_event_status(obj['status'])
        if obj['status'] == 'deleted':
            raise RequestError(
                'You cannot create an Event with status "deleted."')

    # check if the org_id is in the body
    # TODO: I don't think this is necessary.
    org_id = obj.pop('org_id', org_id)

    # get rid of ``id`` if it somehow got in here.
    obj.pop('id', None)

    # normalize the url
    obj['url'] = ingest_util.prepare_url(obj, 'url')

    # sanitize creation date
    obj['created'] = ingest_util.prepare_date(obj, 'created')
    if not obj['created']:
        obj.pop('created')

    # sanitize text/html fields
    obj['title'] = ingest_util.prepare_str(obj, 'title', obj['url'])
    obj['description'] = ingest_util.prepare_str(
        obj, 'description', obj['url'])
    obj['body'] = ingest_util.prepare_str(obj, 'body', obj['url'])

    # get thumbnail
    obj['thumbnail'] = ingest_util.prepare_thumbnail(obj, 'img_url')

    # split out tags_ids + content_item_ids
    tag_ids = obj.pop('tag_ids', [])
    content_item_ids = obj.pop('content_item_ids', [])
    links = obj.pop('links', [])

    # determine event provenance
    obj = _event_provenance(obj, org_id, session)

    # split out meta fields
    obj = ingest_util.split_meta(obj, get_table_columns(Event))

    # see if the event already exists.
    e = session.query(Event)\
        .filter_by(org_id=org_id)\
        .filter_by(source_id=obj['source_id'])\
        .first()

    # if not, create it
    if not e:
        e = Event(org_id=org_id, **obj)

    # else, update it
    else:
        # if it's deleted, issue a message.
        if e.status == 'deleted':
            raise UnprocessableEntityError(
                'Event {} already exists and has been previously deleted.'
                .format(e.id))

        for k, v in obj.items():
            setattr(e, k, v)

    # extract urls and normalize urls asynchronously.
    links = ingest_util.prepare_links(links, org_domains)

    # detect content_items
    if len(links):
        e, has_content_items = _associate_content_items(
            e, org_id, links, content_item_ids, session)

    # associate tags
    if len(tag_ids):
        e = _associate_tags(e, org_id, tag_ids, session)

    # don't commit the event if we're only looking
    # for events that link to content_items
    if not has_content_items and must_link:
        return None

    session.add(e)
    session.commit()
    if kill_session:
        session.close()
    return e
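# Hypothetical usage sketch (not part of the source): a minimal call to the
# event ingest() above, derived from its signature. Values are illustrative
# assumptions; only 'title' is required per `requires`, and passing
# must_link=True would drop events whose links don't resolve to content items.
#
# event = ingest(
#     {'title': 'Example mention',
#      'url': 'http://example.com/mention',
#      'links': ['http://example.com/some-article']},
#     org_id=1,
#     org_domains=['example.com'])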