def _item_parser(self, item, test_url=None):
    """
    Extract fields from one KudaGo JSON record and create Events from them

    :param item: part of the source JSON which contains information about one event
    :param test_url: not read by this method; per the original description it is meant for
                     parsers which have to download additional data from the page of a
                     concrete event
    :return: list of Events: one Event per entry of item['dates'], or a single Event with
             zero start and finish times when the item has no dates. An empty list is
             returned when the title, url or publication date cannot be read
    :raises KeyError: if an entry of item['dates'] has no 'start' or 'end' key
    """
    logging.debug('Enter to the method, iter: %s', item)

    # Title, url and publication date are mandatory - without them the item is skipped.
    # TypeError covers JSON nulls (item, location or a slug equal to None).
    try:
        title = item['title']
        url = ('https://kudago.com/' + item['location']['slug']
               + '/event/' + item['slug'])
        publication_date = item['publication_date']
    except (KeyError, AttributeError, TypeError):
        return []

    event = Event()
    event.title = title
    event.url = url
    event.publication_date = publication_date

    # Optional fields. `dict.get(key, default)` returns None (not the default) when the key
    # is present with a null value, hence the `or []` fallbacks below.
    event.description = item.get('description')
    event.price_kudago = item.get('price')
    event.source_rating_count = item.get('favorites_count')
    event.categories = (item.get('categories') or []) + (item.get('tags') or [])

    # Price parsing is best-effort and isolated, so that a failure here does not discard
    # the other optional fields.
    try:
        event.price_min, event.price_max = self._get_price_from_string(
            item.get('price', ""), item.get('is_free', ""))
    except (KeyError, AttributeError):
        logging.debug('Cannot extract price for event %s', url)

    # Only the first image is used, and only when it really carries an 'image' entry
    images = item.get('images') or []
    first_image = images[0] if images else None
    if isinstance(first_image, dict) and 'image' in first_image:
        event.image = first_image['image']

    event.source = "KudaGo"
    event.status = "active"
    event.join_anytime = False

    # Complicate handling of dates
    dates = item.get('dates') or []
    if not dates:
        event.start_time = 0
        event.finish_time = 0
        events = [event]
    else:
        if len(dates) > 1:
            # Firstly we will use unique identifier of the event - url
            # After saving to database we can use our own id
            event.duplicate_source_id = event.url
        events = []
        for date_of_event in dates:
            # Every date gets its own independent copy of the event
            event_for_date = copy.deepcopy(event)
            event_for_date.start_time = date_of_event['start']
            event_for_date.finish_time = date_of_event['end']
            events.append(event_for_date)

    logging.debug('Events after item parsing: %s', events)
    return events
def _item_parser(self, item, test_url=None):
    """
    Extract fields from one Yandex Afisha (theater) JSON record and create Events from them

    :param item: part of the source JSON which contains information about one event;
                 the event data itself is read from item['event']
    :param test_url: passed through to self._get_start_times_of_the_event, which performs
                     the additional request for the start times of the concrete event
    :return: list of Events: one Event per start time (start and finish time are equal),
             or a single Event with zero start and finish times when no start times were
             found. An empty list is returned when the title or url cannot be read
    """
    logging.debug('Enter to the method, iter: %s', item)

    # Title and url are mandatory - without them the item is skipped.
    # TypeError covers JSON nulls (item, item['event'] or the url equal to None).
    try:
        title = item['event']['title']
        url = 'https://afisha.yandex.ru' + item['event']['url']
    except (KeyError, AttributeError, TypeError):
        return []

    def event_field(*keys):
        # Follow `keys` below item['event']; give None as soon as a level is missing or is
        # not a mapping. Every optional field is looked up independently, so one null
        # nested object (e.g. "image": null) does not discard the fields that follow it.
        node = item['event']
        for key in keys:
            try:
                node = node.get(key)
            except AttributeError:
                return None
        return node

    event = Event()
    event.title = title
    event.url = url

    event.description = event_field('argument')
    event.image = event_field('image', 'eventCover', 'url')
    event.source_rating_value = event_field('userRating', 'overall', 'value')
    event.source_rating_count = event_field('userRating', 'overall', 'count')

    # Category extraction is best-effort: a failure must not drop the event
    try:
        event.categories = self._get_all_categories(event_field('systemTags') or [])
    except (KeyError, AttributeError):
        logging.debug('Cannot extract categories for event %s', url)

    event.source = "YandexAfishaTheater"
    event.status = "active"
    event.join_anytime = False
    event.price_min, event.price_max = self._get_price_from_json(item)

    # We have one more url request for this event and extracting start_times for every date
    yandex_event_id = event_field('id')
    time.sleep(1)  # Pause before the extra request so the source server is not flooded
    # `or []` guards against the helper returning None instead of a list
    # NOTE(review): the helper is not visible here - confirm what it returns on failure
    start_times = self._get_start_times_of_the_event(
        yandex_event_id, test_url=test_url) or []

    # Complicate handling of dates
    if not start_times:
        event.start_time = 0
        event.finish_time = 0
        events = [event]
    else:
        if len(start_times) > 1:
            # Firstly we will use unique identifier of the event - url
            # After saving to database we can use our own id
            event.duplicate_source_id = event.url
        events = []
        for start_time_of_event in start_times:
            # Every session gets its own independent copy of the event
            event_session = copy.deepcopy(event)
            event_session.start_time = start_time_of_event
            event_session.finish_time = start_time_of_event
            events.append(event_session)

    logging.debug('Events after item parsing: %s', events)
    return events
def test_update_events_from_db(self, session, clear_data):
    """
    Check KudaGoParser._update_events_from_db on two events from the database and two
    events from the source which all share one url / duplicate_source_id.

    The expected result list holds exactly three events, in this order:
    the source event used for updating, the source event to be added, and the database
    event for inactivating with its status switched to "hidden".

    :param session: test fixture, not used directly in the body
    :param clear_data: test fixture, not used directly in the body
    """
    # TODO: create events in abstract test class (remember that these events emulate events from database not from url)
    shared_url = "https://kudago.com/msk/event/vyistavka-face-2-face"

    def build_event(**fields):
        # Create an Event and assign the given attributes in the given order
        event = Event()
        for name, value in fields.items():
            setattr(event, name, value)
        return event

    # Form event № 1
    event_from_database_for_updating = build_event(
        _id=78,
        source="KudaGo",
        title="выставка Face 2 Face",
        description="Хотите лицом к лицу встретиться с героями известных фильмов? Добро пожаловать",
        price_kudago="от 0 до 650 рублей",
        url=shared_url,
        categories={"exhibition", "kids", "интерактивные", "новые технологии"},
        image="https://kudago.com/media/images/event/e1/6f/a5f5f.JPG",
        start_time=4695469200,
        finish_time=4699825200,
        join_anytime=False,
        duplicate_source_id=shared_url,
        duplicate_id=45,
        price_min=0,
        price_max=25,
        source_rating_value=2,
        source_rating_count=2,
        status="active",
    )

    # Form event № 2
    # The rating is set on the source event itself: the original code assigned it to the
    # database event a second time, which was a copy-paste slip.
    event_from_source_for_updating = build_event(
        source="KudaGo",
        title="Обсуждение фильма Хоррор",
        description="Вот уж точно фильм, от которого волосы встают дыбом",
        price_kudago="500 рублей",
        url=shared_url,
        categories={"интересненькое", "опера"},
        image="https://kudago.com/media/images/event/e1/6f/5f.JPG",
        start_time=4695469200,
        finish_time=4699825200,
        join_anytime=True,
        duplicate_source_id=shared_url,
        duplicate_id=0,
        price_min=500,
        price_max=500,
        source_rating_value=2,
        source_rating_count=2,
        status="active",
    )

    # Form event № 3
    event_from_database_for_inactivating = build_event(
        _id=80,
        source="KudaGo",
        url=shared_url,
        start_time=4695469500,
        finish_time=4699825500,
        duplicate_source_id=shared_url,
        duplicate_id=45,
        status="active",
    )

    # Form event № 4
    event_from_source_for_adding = build_event(
        source="KudaGo",
        url=shared_url,
        start_time=4695469900,
        finish_time=4699825900,
        duplicate_source_id=shared_url,
        duplicate_id=0,
        status="active",
    )

    # Form lists for events as arguments for function under testing.
    # Deep copies keep the expected events above untouched by the function.
    events = [
        copy.deepcopy(event_from_source_for_updating),
        copy.deepcopy(event_from_source_for_adding),
    ]
    same_events_in_db = [
        copy.deepcopy(event_from_database_for_updating),
        copy.deepcopy(event_from_database_for_inactivating),
    ]

    # Test our function
    result_list_of_events = KudaGoParser._update_events_from_db(
        same_events_in_db, events)

    # Check the size first so a short result fails as an assertion, not an IndexError
    assert len(result_list_of_events) == 3

    self.check_equivalence_of_two_events(result_list_of_events[0],
                                         event_from_source_for_updating)
    self.check_equivalence_of_two_events(result_list_of_events[1],
                                         event_from_source_for_adding)
    # The database event is expected back with the "hidden" status
    event_from_database_for_inactivating.status = "hidden"
    self.check_equivalence_of_two_events(
        result_list_of_events[2], event_from_database_for_inactivating)
def _item_parser(self, item, test_url=None):
    """
    Extract fields from one Yandex Afisha (cinema) JSON record and create Events from them

    :param item: part of the source JSON which contains information about one event;
                 the event data is read from item['event'] and the dates from
                 item['scheduleInfo']['dates']
    :param test_url: not read by this method; per the original description it is meant for
                     parsers which have to download additional data from the page of a
                     concrete event
    :return: list of Events: one Event per date, covering the whole UTC day of that date,
             or a single Event with zero start and finish times when there are no dates.
             An empty list is returned when the title or url cannot be read
    :raises ValueError: if a date is not made of valid integer year-month-day parts
    :raises IndexError: if a date has fewer than three '-' separated parts
    """
    logging.debug('Enter to the method, iter: %s', item)

    # Title and url are mandatory - without them the item is skipped.
    # TypeError covers JSON nulls (item, item['event'] or the url equal to None).
    try:
        title = item['event']['title']
        url = 'https://afisha.yandex.ru' + item['event']['url']
    except (KeyError, AttributeError, TypeError):
        return []

    def event_field(*keys):
        # Follow `keys` below item['event']; give None as soon as a level is missing or is
        # not a mapping. Every optional field is looked up independently, so one null
        # nested object (e.g. "kinopoisk": null) does not discard the fields that follow.
        node = item['event']
        for key in keys:
            try:
                node = node.get(key)
            except AttributeError:
                return None
        return node

    event = Event()
    event.title = title
    event.url = url

    event.description = event_field('argument')
    event.image = event_field('image', 'eventCover', 'url')
    event.source_rating_value = event_field('kinopoisk', 'value')
    event.source_rating_count = event_field('kinopoisk', 'votes')

    # Category extraction is best-effort: a failure must not drop the event
    try:
        event.categories = self._get_all_categories(event_field('systemTags') or [])
    except (KeyError, AttributeError):
        logging.debug('Cannot extract categories for event %s', url)

    event.source = "YandexAfishaCinema"
    event.status = "active"
    # Not every session of every film is written to the DB (too many every day), only the
    # days when the film is on screens in cinemas
    event.join_anytime = True

    # Complicate handling of dates.
    # `dict.get(key, default)` returns None for a key present with a null value,
    # hence the `or` fallbacks.
    schedule_info = item.get('scheduleInfo') or {}
    dates = schedule_info.get('dates') or []
    seconds_per_day = 86400

    if not dates:
        event.start_time = 0
        event.finish_time = 0
        events = [event]
    else:
        if len(dates) > 1:
            # Firstly we will use unique identifier of the event - url
            # After saving to database we can use our own id
            event.duplicate_source_id = event.url
        events = []
        for date_of_event in dates:
            # Every date gets its own independent copy of the event
            event_for_date = copy.deepcopy(event)
            date_parts = date_of_event.split('-')  # date_of_event = "2018-10-14"
            day_start = datetime.datetime(int(date_parts[0]),
                                          int(date_parts[1]),
                                          int(date_parts[2]),
                                          tzinfo=datetime.timezone.utc)
            event_for_date.start_time = int(day_start.timestamp())
            # one full day in seconds without 1 second
            event_for_date.finish_time = event_for_date.start_time + seconds_per_day - 1
            events.append(event_for_date)

    logging.debug('Events after item parsing: %s', events)
    return events