def _set_fetch_params(params, now):
    fetch_params: Dict[str, Any] = {'meta[total]': True, 'stats': True}
    limit = arg_to_number(
        arg=demisto.params().get('max_fetch'), arg_name='max_fetch', required=False
    )
    if limit is not None:
        if limit > 10000:
            demisto.info('Adjusting artifact limit to maximum of 10000')
            limit = 10000
        fetch_params['limit'] = limit
    last_run = demisto.getLastRun()
    max_start_time_date = parse_date(MAX_START_TIME)
    assert max_start_time_date is not None
    max_time = max_start_time_date.timestamp()
    first_fetch_date = parse_date(params.get('first_fetch'))
    assert first_fetch_date is not None, f"could not parse {params.get('first_fetch')}"
    start_time = first_fetch_date.timestamp()
    if last_run and 'start_time' in last_run:
        start_time = last_run.get('start_time')
    if max_time > start_time:
        raise DemistoException('First fetch time can be a maximum of 90 days')
    fetch_params['date[start]'] = int(start_time)
    fetch_params['date[end]'] = int(now.timestamp())
    return fetch_params
def handle(self, *args, **options):
    self.sleep_time = options['sleep'] / 1000
    args = {}
    if options['organization']:
        args['organization__internal_id'] = options['organization']
    if options['platform']:
        args['platform__short_name'] = options['platform']
    if options['version']:
        args['counter_version'] = int(options['version'])
    credentials = list(SushiCredentials.objects.filter(**args))
    cr_args = {'active': True}
    if options['version']:
        cr_args['counter_version'] = int(options['version'])
    if options['report']:
        cr_args['code'] = options['report']
    # now fetch all possible combinations
    start_date = month_start(parse_date(options['start_date']))
    end_date = month_end(parse_date(options['end_date']))
    # we divide the requests into groups by platform and counter version combination
    # and then process each group in a separate thread
    platform_counter_v_to_requests = {}
    for cred in credentials:
        crs = list(cred.active_counter_reports.filter(**cr_args))
        for cr in crs:
            key = (cred.platform_id, cred.counter_version)
            # check if we have a successful attempt already and skip it if yes
            success_req_for_skip = {'download_success': True, 'processing_success': True} \
                if not options['skip_on_unsuccess'] else {}
            existing = SushiFetchAttempt.objects.filter(
                credentials=cred,
                counter_report=cr,
                start_date=start_date,
                end_date=end_date,
                **success_req_for_skip,
            ).exists()
            if existing:
                self.stderr.write(
                    self.style.SUCCESS(f'Skipping existing {cred}, {cr}'))
            else:
                if key not in platform_counter_v_to_requests:
                    platform_counter_v_to_requests[key] = []
                platform_counter_v_to_requests[key].append(
                    (cred, cr, start_date, end_date))
    if not platform_counter_v_to_requests:
        self.stderr.write(self.style.WARNING('No matching reports found!'))
        return
    # let's create some threads and use them to process individual platforms
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        for result in executor.map(
                self.download_list, list(platform_counter_v_to_requests.items())):
            pass
def compare_fields(label, value, fields):
    field = find(lambda f: f['label'] == label, fields)
    if field['type'] == 'NUMBER':
        if float(value) == 0.0 and 'text' not in field:
            # 0 values are not sent in the response
            return
        assert_that(float(field['text'])).is_equal_to(float(value))
    elif field['type'] == 'DATETIME' or field['type'] == 'DATE':
        date_field = parse_date(field['text'])
        date_value = parse_date(value)
        assert_that(date_field.date()).is_equal_to(date_value.date())
    else:
        assert_that(field['text']).is_equal_to(value)
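# A minimal usage sketch for compare_fields with hypothetical field data;
# it assumes assert_that comes from assertpy, find returns the first match,
# and parse_date accepts both date formats (as dateparser.parse would).
fields = [
    {'label': 'Amount', 'type': 'NUMBER', 'text': '42.0'},
    {'label': 'Due', 'type': 'DATE', 'text': '2021-03-05'},
]
compare_fields('Amount', '42', fields)          # 42.0 == 42.0, passes
compare_fields('Due', 'March 5, 2021', fields)  # same calendar date, passes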
def extract_daterange(url, ref_date=None):
    if not url.startswith("/wiki/"):
        return None
    filename = "wiki/%s.html" % (url[6:].replace("/", "_"), )
    if not os.path.exists(filename):
        return None
    with open(filename, 'r') as fp:
        html = fp.read()
    soup = BeautifulSoup(html, "html.parser")
    table = soup.find("table", {'class': "infobox"})
    if not table:
        return None
    for row in table.find_all("tr"):
        columns = row.find_all(["th", "td"])
        if len(columns) != 2:
            continue
        if columns[0].get_text().strip().lower() == "date":
            break
    else:
        return None
    element = columns[1]
    for child in reversed(element.find_all("sup")):
        child.extract()
    for br in soup.find_all("br"):
        br.replace_with("\n")
    return dateparser.parse_date(element.get_text(), ref_date=ref_date)
def parse_event(p):
    texts = list(p.children)
    line1 = texts[0].text.strip()
    content = list(texts[-1].children)
    date, location = line1.split("*")
    date = parse_date(date)
    location = location.strip()
    programme = [p.strip() for p in content[0].contents if is_text(p)]
    performer = None
    venue = None
    source = None
    for line in content[1:]:
        if is_text(line) and len(line.strip()) > 0:
            if performer is None:
                performer = line.strip()
                continue
            if venue is None:
                venue = line.strip()
                continue
        if line.name == "a":
            source = content[8].attrs["href"]
    return {
        "uid": "%s-%s" % (str(date.date()), location.lower().replace(" ", "/")),
        "date": date,
        "performer": performer,
        "venue": venue,
        "location": location,
        "programme": programme,
        "source": source,
    }
def parse_activity(activity, scraptime):
    activities = [_.strip().encode('utf-8')
                  for _ in activity.xpath('ul/li/text()').extract()]
    text_activity = dict()
    for line in activities:
        first_word = line.split()[0]
        content = ':'.join(line.split(':')[1:]).strip()
        text_activity[first_word] = content
    num_post, reply_rate, last_ping, joined = 4*[None, ]
    if 'Annonces' in text_activity:
        num_post = int(text_activity['Annonces'])
    if 'Taux' in text_activity:
        reply_rate = int(text_activity['Taux'][:-1])
    # for https://github.com/scrapinghub/dateparser/issues/268
    last_ping = text_activity['Dernière'].replace('mar.', '')
    last_ping = last_ping.replace('lun. 29 févr.', 'lun. 29 févr. 2016')
    last_ping = int((scraptime - parse_date(last_ping)).total_seconds())
    joined = parse_date(text_activity['Membre']).date()
    return num_post, reply_rate, last_ping, joined
def get_release_date(self, info_box):
    """
    a helper method to get the release date of the movie
    :param info_box: the beautiful soup object of the info box
    :return: the release date of the movie, or None if cannot parse
    """
    try:
        release_date = info_box.find("span", attrs={"class": "published"}).text
        return parse_date(release_date)
    except AttributeError:
        return None
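# A minimal usage sketch for get_release_date; `scraper` stands in for an
# instance of the class this method belongs to (hypothetical), and parse_date
# is assumed to behave like dateparser.parse.
from bs4 import BeautifulSoup

info_box = BeautifulSoup(
    '<td><span class="published">16 July 2010</span></td>', 'html.parser')
print(scraper.get_release_date(info_box))   # e.g. datetime.datetime(2010, 7, 16, 0, 0)
empty_box = BeautifulSoup('<td></td>', 'html.parser')
print(scraper.get_release_date(empty_box))  # None (AttributeError branch)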
def init_dummy_data(db: SQLAlchemy):
    cpf = CPF()
    generated_cpf = cpf.generate()
    d1 = parse_date(u'1900/01/01 00:00:00')
    d2 = parse_date(u'2020/01/01 23:59:59')
    me = Colaborador()
    # note: no trailing commas here, which would turn each value into a 1-tuple
    me.cpf = generated_cpf
    me.nome = 'John'
    me.prenome = ''
    me.sobrenome = 'Smith'
    me.cargo = 'Functional Test Engineer'
    me.status = 'Ativo'
    me.data_nascimento = random_date(d1, d2)
    me.email = '*****@*****.**'
    me.endereco = 'Rua Um Dois Três Quatro'
    me.numero = '567'
    me.complemento = 'ap 890'
    me.bairro = 'Fixe Giro'
    me.cidade = 'São Paulo'
    me.estado = 'SP'
    me.cep = '00000000'
    me.ddd = '11'
    me.telefone = '987654321'
    db.session.add(me)
    db.session.commit()
    mi = Ponto()
    mi.data_hora = random_date(d1, d2)
    mi.cpf_colaborador = generated_cpf
    db.session.add(mi)
    db.session.commit()
def get_html(self, url, closest_datetime_str=None):
    canonical = canonicalize(url)
    metas = self.meta_index.get(canonical)
    if not metas:
        return None
    metas = [json.loads(m) for m in metas]
    metas = [m for m in metas if m['status'] == "200"]
    if len(metas) > 1 and closest_datetime_str and sort_by_closest(
            metas, parse_date(closest_datetime_str)):
        pass  # successfully sorted metas by reference time
    else:
        metas = sorted(metas, key=lambda m: m['filename'], reverse=True)
    html = get_first_or_none(
        map(CommonCrawlS3.fetch_html_from_s3_file, metas))
    return html
def _parse_task(self, user, task):
    """Normalize a task dict: parse its date and period, or pick suitable
    defaults for the user when they are missing."""
    if 'date' in task:
        try:
            task['date'] = parse_date(task['date'])
        except Exception:
            raise ValueError(f"Failed to parse date {task['date']}")
    else:
        task['date'] = self._determine_suitable_date(user)
    if 'period' in task:
        task['period'] = int(task['period'])
    else:
        task['period'] = self._determine_suitable_period(user)
    return task
def parse_parking_session(el):
    try:
        rate = el.find_element_by_class_name("rate-option-details").text
    except NoSuchElementException:
        rate = "none"
    return ParkingSession(
        LicensePlate=el.find_element_by_class_name("license-plate").text,
        LocationNumber=el.find_element_by_class_name(
            "location-number").text,
        ExpiryDate=parse_date(
            el.find_element_by_class_name(
                "expiry-date").find_element_by_tag_name("strong").text),
        RateOption=rate,
    )
def complete(self, user, message):
    """
    Register that you've completed a task
    :param user:
    :param message:
    :return:
    """
    task = self.parse_message(user, message)
    if 'date' in task:
        date = parse_date(task['date'])
    else:
        date = get_current_datetime()
    task = self.db.get_task(user=user, shortcut=task['shortcut'])
    task.completions.append(date)
    task.date = date + datetime.timedelta(days=task.period)
    self.db.update_task(task)
def _get_datetime_obj_from_value(self, value: str):
    if self.search:
        matches = search_dates(
            text=value,
            languages=self._languages,
        )
        # search_dates returns a list of (snippet, datetime) tuples
        return matches[0][-1] if matches else None
    return parse_date(
        date_string=value,
        date_formats=self._date_formats,
        languages=self._languages,
        locales=self._locales,
        region=self._region,
    )
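# A sketch of the two modes used above, calling dateparser directly; the
# wrapper's attributes (self.search, self._languages, ...) are assumptions
# about the surrounding class.
from dateparser import parse as parse_date
from dateparser.search import search_dates

print(parse_date('2021-03-05', languages=['en']))
# e.g. datetime.datetime(2021, 3, 5, 0, 0)
print(search_dates('the deadline moved to 5 March 2021', languages=['en']))
# e.g. [('5 March 2021', datetime.datetime(2021, 3, 5, 0, 0))]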
def list(self, user, message):
    """
    Get tasks for particular date.
    """
    message = trim(message, '/list')
    if message.strip():
        date = parse_date(message)
    else:
        date = get_current_datetime()
    self.actualize_tasks()
    tasks = self.db.get_users_tasks(user)
    # cast to date because the stored value is a datetime with hours etc.
    tasks = [task for task in tasks if to_date(task.date) == to_date(date)]
    response = date.strftime("Tasks for %a, %d %b\n")
    response += "\n".join([task.text for task in tasks])
    return response
def update(self, item):
    """
    Update current movie node using another movie node
    :param item: the item or dict representing the other movie
    """
    if not (isinstance(item, MovieItem) or isinstance(item, dict)):
        return
    self.name = item.get("name", self.name)
    self.box_office = item.get(
        "box_office", 0 if self.box_office is None else self.box_office)
    self.wiki_page = get_wiki_page(item.get("wiki_page", self.wiki_page))
    self.release_date = item.get("release_date", self.release_date)
    # parse release date if it is a string
    if isinstance(self.release_date, str):
        self.release_date = parse_date(self.release_date)
    # fall back to January 1st of the item's year if no release date was found
    if self.release_date is None and "year" in item:
        year = item.get("year")
        if MINYEAR <= year <= MAXYEAR:
            self.release_date = datetime(year, 1, 1)
def fetch_new_sushi_data(credentials: Optional[SushiCredentials] = None):
    """
    Goes over all the report types and credentials in the database and tries to
    fetch new sushi data where it makes sense
    :param credentials: if given, only report_types related to the credentials'
        organization and platform will be processed
    :return:
    """
    fetch_units = create_fetch_units()
    if credentials:
        fetch_units = filter_fetch_units_by_credentials(fetch_units, credentials)
    lock_name_to_units = split_fetch_units_by_url_lock_name(fetch_units)
    start_date = month_start(month_start(now().date()) - timedelta(days=15))  # start of prev month
    end_date = month_start(parse_date(settings.SUSHI_ATTEMPT_LAST_DATE))
    # do not use a per-request lock; we lock the whole queue with the same URL
    processing_fn = partial(process_fetch_units_wrapper,
                            conflict_ok='skip', conflict_error='smart',
                            sleep_time=2, use_lock=False)
    args = [(lock_name, fus, start_date, end_date)
            for lock_name, fus in lock_name_to_units.items()]
    logger.info('Starting processing of %d sushi fetching queues', len(args))
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        for result in executor.map(processing_fn, args):
            pass
def parse(self, response):
    # user info
    id_ = response.url.split('/')[-1].split('?')[0]
    if id_ in self.seen_users:
        return
    if id_ not in self.all_reviews:
        self.all_reviews[id_] = list()
    reviews_div = response.css('div.user-comments-container')
    for r in reviews_div.xpath('div/ul/li/article'):
        from_ = r.xpath('a/@href').extract_first()
        if not from_:
            # a reviewer without profile, like Estelle here
            # https://www.blablacar.fr/membre/profil/Hx6O3ju2jAXgT7MpEKijcg
            continue
        from_ = from_.split('/')[-1].encode('ascii')
        text_raw = r.xpath('div[contains(@class, "Speech-content")]/p/text()').extract()
        text = ''.join((_.strip().encode('utf-8').replace('\n', ' ') for _ in text_raw))
        grade_class = r.xpath('div[contains(@class, "Speech-content")]/h3/@class')
        grade = [int(i.split('--')[-1])
                 for i in grade_class.extract_first().split() if '--' in i][0]
        when = parse_date(r.xpath('footer/time/@datetime').extract_first())
        review = {'from': from_, 'grade': grade, 'text': text, 'when': str(when)}
        reply = r.xpath('aside[contains(@class, "Rating-rightOfReply")]/blockquote/text()').extract_first()
        if reply:
            review['reply'] = reply.strip().encode('utf-8')
        self.all_reviews[id_].append(review)
    next_link = reviews_div.xpath('div[contains(@class, "pagination")]/ul/li[contains(@class, "next")]/a/@href').extract_first()
    if next_link:
        yield scrapy.Request('{}{}'.format(WEBSITE, next_link), callback=self.parse)
        return
    scraptime = datetime.utcnow()
    name = response.css('h1.ProfileCard-info--name::text').extract_first().strip().encode('utf-8')
    age = int(response.css('div.ProfileCard-info:nth-child(2)::text').extract_first().strip().split()[0])
    profile_info = response.css('div.ProfileCard').css('div.ProfileCard-row')
    experience = profile_info[0].css('span.megatip-popover::text').extract_first()
    if experience is not None:
        experience = experience.strip().encode('utf-8')
    # might also be oldtitle instead of title
    prefs = [_.strip().encode('utf-8')
             for _ in profile_info[-1].xpath('span[contains(@class, "big-prefs")]/@title').extract()]
    bio_raw = response.xpath('//div[contains(@class, "member-bio")]/p/text()').extract()
    bio = ''.join((_.strip().encode('utf-8').replace('\n', ' ') for _ in bio_raw)).strip('""')
    # activity
    num_post, reply_rate, last_ping, joined = parse_activity(
        response.css('div.main-column-block:nth-child(2)'), scraptime)
    id_verified = response.css('div.ProfileCard-info.u-blue::text').extract_first()
    verif = [_.strip().encode('utf-8')
             for _ in response.css('ul.verification-list li span::text').extract()]
    if id_verified:
        verif.append(id_verified.strip().encode('utf-8'))
    # car
    car = response.css('ul.user-car-details li')
    if car:
        car_name = car[0].xpath('h4/strong/text()').extract_first().strip().encode('utf-8')
        car_color, car_comfort = [_.strip().split(':')[-1].strip().encode('utf-8')
                                  for _ in car[1:].xpath('text()').extract()]
        car_comfort_num = [int(i.split('_')[-1])
                           for i in car[0].xpath('h4/span/@class').extract_first().split()
                           if 'star' in i][0]
    these_reviews = list(self.all_reviews[id_])
    del self.all_reviews[id_]
    logging.info('DONE_WITH {}'.format(id_))
    self.seen_users.add(id_)
    yield {'id': id_,
           'name': name,
           'age': age,
           'experience': experience,
           'preferences': prefs,
           'biography': bio,
           'num_posts': num_post,
           'reply_rate': reply_rate,
           'joined': str(joined),
           'last_ping': last_ping,
           'scraptime': str(scraptime),
           'car': None if len(car) == 0 else {'name': car_name, 'color': car_color,
                                              'comfort': car_comfort_num},
           'verifications': verif,
           'reviews': these_reviews,
           }
def rider_details(self, response):
    # xpath details
    final_route = '//h1[@class="RideName RideName--title"]//span//text()'
    dep_point = '//span[@data-position="departure"]/span/text()'
    drop_point = '//span[@data-position="arrival"]/span/text()'
    dep_date = '//strong[@class="RideDetails-infoValue"]//span/text()'
    options = '//span[@class="u-alignMiddle"]/text()'
    price = '//span[@class="Booking-price u-block"]/text()'
    seats = '//span[@class="Booking-seats u-block"]/b/text()'
    own_image = '//div[@class="ProfileCard"]//div[@class="ProfileCard-picture"]//img/@src'
    own_name = '//div[@class="ProfileCard"]//div[@class="ProfileCard-infosBlock"]//h4//text()'
    own_desc = '//div[@class="ProfileCard"]//div[@class="ProfileCard-infosBlock"]//div[@class="ProfileCard-info u-blue"]/text()'
    rate = '//span[@class="u-textBold u-darkGray"]/text()'
    own_age = '//div[@class="ProfileCard"]//div[@class="ProfileCard-infosBlock"]//div[@class="ProfileCard-info"]/text()'
    car = '//div[@class="Profile-car u-table"]//p[@class="Profile-carDetails u-cell"]/text()'
    # data extraction
    route = response.xpath(final_route).extract()
    dep_point = response.xpath(dep_point).extract()
    drop_point = response.xpath(drop_point).extract()
    dep_date = response.xpath(dep_date).extract()
    options = response.xpath(options).extract()
    price = response.xpath(price).extract()
    seats = response.xpath(seats).extract()
    image = response.xpath(own_image).extract()
    name = response.xpath(own_name).extract()
    age = response.xpath(own_age).extract()
    description = response.xpath(own_desc).extract()
    rating = response.xpath(rate).extract()
    car = response.xpath(car).extract()
    # cleaning of data
    departure = ''.join(dep_point).strip()
    droping = ''.join(drop_point).strip()
    dep_date = ''.join(dep_date).strip()
    dep_date = parse_date(dep_date)
    options = ''.join(options).strip()
    price = ''.join(price).strip().replace('₹\xa0', '')
    seats = ''.join(seats).strip()
    image = ''.join(image).strip()
    name = ''.join(name).strip()
    age = ''.join(age).strip().replace(' years old', '')
    description = ''.join(description).strip()
    rating = ''.join(rating).strip()
    if route:
        source = route[0]
        destination = route[-1]
    car_model = ""
    car_color = ""
    if car:
        car_model = car[0].strip()
        car_color = car[1].strip()
        car_color = car_color.split('Colour: ')
        car_color = car_color[-1]
    yield blablacarItem(source=source,
                        destination=destination,
                        departure_point=departure,
                        drop_off_point=droping,
                        departure_date=dep_date,
                        options=options,
                        seats_left=seats,
                        car_owner_image=image,
                        car_owner_name=name,
                        car_owner_age=age,
                        car_owner_verification=description,
                        car_owner_rating=rating,
                        car_model=car_model,
                        car_colour=car_color)
def date(str_date):
    return parse_date(str_date)
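# A quick sketch of the date() wrapper; the outputs assume parse_date is a
# flexible parser such as dateparser.parse.
print(date('2021-03-05'))    # e.g. datetime.datetime(2021, 3, 5, 0, 0)
print(date('5 March 2021'))  # same date, different input format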
def __from_to_helper(self, validate=None):
    try:
        validate = parse_date(validate).strftime('%Y-%m-%d')
        return str(validate)
    except Exception:
        raise Exception('Could not parse your date')
def datetime_to_canon(date: str) -> Optional[str]:
    dt = parse_date(date)
    if dt is not None:
        return dt.strftime("%Y-%m-%dT%H:%M")
    return None
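# A short sketch of datetime_to_canon; exact results depend on what
# parse_date accepts (dateparser.parse is assumed here).
print(datetime_to_canon('5 March 2021 14:30'))  # e.g. '2021-03-05T14:30'
print(datetime_to_canon('not a date'))          # None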