Example #1
def _set_fetch_params(params, now):
    """Build the artifact query parameters for a fetch window ending at now."""
    fetch_params: Dict[str, Any] = {'meta[total]': True, 'stats': True}

    limit = arg_to_number(
        arg=demisto.params().get('max_fetch'),
        arg_name='max_fetch',
        required=False
    )

    if limit is not None:
        if limit > 10000:
            demisto.info('Adjusting artifact limit to maximum of 10000')
            limit = 10000

        fetch_params['limit'] = limit

    last_run = demisto.getLastRun()

    max_start_time_date = parse_date(MAX_START_TIME)
    assert max_start_time_date is not None
    max_time = max_start_time_date.timestamp()
    first_fetch_date = parse_date(params.get('first_fetch'))
    assert first_fetch_date is not None, f"could not parse {params.get('first_fetch')}"
    start_time = first_fetch_date.timestamp()
    if last_run and 'start_time' in last_run:
        start_time = last_run.get('start_time')

    if max_time > start_time:
        raise DemistoException('First fetch time can be at most 90 days in the past')

    fetch_params['date[start]'] = int(start_time)
    fetch_params['date[end]'] = int(now.timestamp())
    return fetch_params
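A hypothetical call, assuming the integration params contain first_fetch='3 days' and max_fetch is unset (the epoch values below are illustrative):

_set_fetch_params(demisto.params(), datetime.now(timezone.utc))
# -> {'meta[total]': True, 'stats': True,
#     'date[start]': 1618790400, 'date[end]': 1619049600}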
Example #2
 def handle(self, *args, **options):
     self.sleep_time = options['sleep'] / 1000
     filters = {}
     if options['organization']:
         filters['organization__internal_id'] = options['organization']
     if options['platform']:
         filters['platform__short_name'] = options['platform']
     if options['version']:
         filters['counter_version'] = int(options['version'])
     credentials = list(SushiCredentials.objects.filter(**filters))
     cr_args = {'active': True}
     if options['version']:
         cr_args['counter_version'] = int(options['version'])
     if options['report']:
         cr_args['code'] = options['report']
     # now fetch all possible combinations
     start_date = month_start(parse_date(options['start_date']))
     end_date = month_end(parse_date(options['end_date']))
     # we divide the requests to groups by platform and counter version combination
     # and then process each group in a separate thread
     platform_counter_v_to_requests = {}
     for cred in credentials:
         crs = list(cred.active_counter_reports.filter(**cr_args))
         for cr in crs:
             key = (cred.platform_id, cred.counter_version)
             # skip combinations that already have a successful attempt;
             # with --skip_on_unsuccess, skip on any existing attempt at all
             success_req_for_skip = (
                 {} if options['skip_on_unsuccess']
                 else {'download_success': True, 'processing_success': True}
             )
             existing = SushiFetchAttempt.objects.filter(
                 credentials=cred,
                 counter_report=cr,
                 start_date=start_date,
                 end_date=end_date,
                 **success_req_for_skip,
             ).exists()
             if existing:
                 self.stderr.write(
                     self.style.SUCCESS(f'Skipping existing {cred}, {cr}'))
             else:
                  platform_counter_v_to_requests.setdefault(key, []).append(
                      (cred, cr, start_date, end_date))
     if not platform_counter_v_to_requests:
         self.stderr.write(self.style.WARNING('No matching reports found!'))
         return
     # let's create some threads and use them to process individual platforms
     with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
         # drain the iterator so that exceptions raised in worker threads propagate
         for result in executor.map(
                 self.download_list,
                 list(platform_counter_v_to_requests.items())):
             pass
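month_start and month_end are project helpers not shown in the snippet; a plausible sketch of their behaviour (an assumption, not the project's actual code):

import calendar
from datetime import date

def month_start(d: date) -> date:
    # first day of d's month
    return d.replace(day=1)

def month_end(d: date) -> date:
    # last day of d's month
    return d.replace(day=calendar.monthrange(d.year, d.month)[1])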
Example #3
def compare_fields(label, value, fields):
    field = find(lambda f: f['label'] == label, fields)
    if field['type'] == 'NUMBER':
        # 0 values are not sent in the response
        if float(value) == 0.0 and 'text' not in field:
            return
        assert_that(float(field['text'])).is_equal_to(float(value))
    elif field['type'] in ('DATETIME', 'DATE'):
        date_field = parse_date(field['text'])
        date_value = parse_date(value)
        assert_that(date_field.date()).is_equal_to(date_value.date())
    else:
        assert_that(field['text']).is_equal_to(value)
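A minimal usage sketch; the helpers find, assert_that and parse_date come from the surrounding module (assumed here to behave like funcy's find, assertpy's assert_that and dateparser's parse):

fields = [
    {'label': 'Amount', 'type': 'NUMBER', 'text': '42.0'},
    {'label': 'Due', 'type': 'DATE', 'text': '2021-03-01'},
]
compare_fields('Amount', '42', fields)         # passes: 42.0 == 42.0
compare_fields('Due', '1 March 2021', fields)  # passes: same calendar date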
Example #4
def extract_daterange(url, ref_date=None):
    if not url.startswith("/wiki/"):
        return None

    filename = "wiki/%s.html" % (url[6:].replace("/", "_"), )
    if not os.path.exists(filename):
        return None

    with open(filename, 'r') as fp:
        html = fp.read()

    soup = BeautifulSoup(html, "html.parser")

    table = soup.find("table", {'class': "infobox"})
    if not table:
        return None

    for row in table.find_all("tr"):
        columns = row.find_all(["th", "td"])
        if len(columns) != 2:
            continue
        if columns[0].get_text().strip().lower() == "date":
            break
    else:
        return None

    element = columns[1]
    # drop <sup> citation markers so they do not pollute the text
    for child in reversed(element.find_all("sup")):
        child.extract()

    # turn <br> tags into newlines before extracting the text
    for br in soup.find_all("br"):
        br.replace_with("\n")

    return dateparser.parse_date(element.get_text(), ref_date=ref_date)
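A hypothetical call, assuming a cached copy of the article is present under wiki/ (the page name is illustrative):

extract_daterange("/wiki/Battle_of_Hastings")
# -> a datetime recovered from the infobox's "Date" row (here 14 October 1066),
#    or None if the page, infobox or row is missing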
Example #5
def parse_event(p):
    texts = list(p.children)
    line1 = texts[0].text.strip()
    content = list(texts[-1].children)
    date, location = line1.split("*")
    date = parse_date(date)
    location = location.strip()
    programme = [p.strip() for p in content[0].contents if is_text(p)]
    performer = None
    venue = None
    source = None
    for line in content[1:]:
        if is_text(line) and len(line.strip()) > 0:
            if performer is None:
                performer = line.strip()
                continue
            if venue is None:
                venue = line.strip()
                continue
        if line.name == "a":
            source = content[8].attrs["href"]

    return {
        "uid": "%s-%s" % (str(date.date()), location.lower().replace(" ", "/")),
        "date": date,
        "performer": performer,
        "venue": venue,
        "location": location,
        "programme": programme,
        "source": source,
    }
Example #6
def parse_activity(activity, scraptime):
    activities = [_.strip().encode('utf-8')
                  for _ in activity.xpath('ul/li/text()').extract()]
    text_activity = dict()
    for line in activities:
        first_word = line.split()[0]
        content = ':'.join(line.split(':')[1:]).strip()
        text_activity[first_word] = content
    num_post, reply_rate, last_ping, joined = [None] * 4
    if 'Annonces' in text_activity:
        num_post = int(text_activity['Annonces'])
    if 'Taux' in text_activity:
        reply_rate = int(text_activity['Taux'][:-1])
    # work around https://github.com/scrapinghub/dateparser/issues/268:
    # strip the 'mar.' abbreviation that trips up dateparser and pin the
    # 29 févr. leap day to an explicit year so it can be parsed
    last_ping = text_activity['Dernière'].replace('mar.', '')
    last_ping = last_ping.replace('lun. 29 févr.', 'lun. 29 févr. 2016')
    last_ping = int((scraptime - parse_date(last_ping)).total_seconds())
    joined = parse_date(text_activity['Membre']).date()
    return num_post, reply_rate, last_ping, joined
Example #7
 def get_release_date(self, info_box):
     """
     a helper method to get the release date of the movie
     :param info_box: the beautiful soup object of the info box
     :return: the release date of the movie, or None if cannot parse
     """
     try:
         release_date = info_box.find("span", attrs={
             "class": "published"
         }).text
         return parse_date(release_date)
     except AttributeError:
         return None
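A hypothetical use, assuming parse_date is dateparser.parse and scraper is an instance of the surrounding class:

from bs4 import BeautifulSoup
info_box = BeautifulSoup(
    '<table><tr><td><span class="published">16 July 2010</span></td></tr></table>',
    'html.parser')
scraper.get_release_date(info_box)  # -> datetime.datetime(2010, 7, 16, 0, 0)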
Example #8
def init_dummy_data(db: SQLAlchemy):

    cpf = CPF()
    generated_cpf = cpf.generate()
    d1 = parse_date(u'1900/01/01 00:00:00')
    d2 = parse_date(u'2020/01/01 23:59:59')

    me = Colaborador()

    me.cpf = generated_cpf
    me.nome = 'John'
    me.prenome = ''
    me.sobrenome = 'Smith'
    me.cargo = 'Functional Test Engineer'
    me.status = 'Ativo'
    me.data_nascimento = random_date(d1, d2)
    me.email = '*****@*****.**'
    me.endereco = 'Rua Um Dois Três Quatro'
    me.numero = '567'
    me.complemento = 'ap 890'
    me.bairro = 'Fixe Giro'
    me.cidade = 'São Paulo'
    me.estado = 'SP'
    me.cep = '00000000'
    me.ddd = '11'
    me.telefone = '987654321'

    db.session.add(me)
    db.session.commit()

    mi = Ponto()

    mi.data_hora = random_date(d1, d2)
    mi.cpf_colaborador = generated_cpf

    db.session.add(mi)
    db.session.commit()
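random_date is a helper that is not shown here; a plausible sketch (an assumption, not the project's actual code):

import random
from datetime import datetime, timedelta

def random_date(start: datetime, end: datetime) -> datetime:
    # uniformly random datetime between start and end
    span = int((end - start).total_seconds())
    return start + timedelta(seconds=random.randint(0, span))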
Example #9
    def get_html(self, url, closest_datetime_str=None):
        canonical = canonicalize(url)
        metas = self.meta_index.get(canonical)
        if not metas:
            return None
        metas = [json.loads(m) for m in metas]
        metas = [m for m in metas if m['status'] == "200"]
        sorted_ok = (len(metas) > 1 and closest_datetime_str
                     and sort_by_closest(metas, parse_date(closest_datetime_str)))
        if not sorted_ok:
            # could not sort by reference time; fall back to newest capture first
            metas = sorted(metas, key=lambda m: m['filename'], reverse=True)

        html = get_first_or_none(
            map(CommonCrawlS3.fetch_html_from_s3_file, metas))
        return html
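canonicalize, sort_by_closest and get_first_or_none come from the surrounding module; a plausible sketch of get_first_or_none (an assumption):

def get_first_or_none(iterable):
    # first non-None item, or None if the iterable is exhausted
    return next((item for item in iterable if item is not None), None)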
Example #10
 def _parse_task(self, user, task):
     """
     Parse the task's date and period if given, otherwise fill in
     suitable defaults for the user.
     """
     if 'date' in task:
         try:
             task['date'] = parse_date(task['date'])
         except Exception as exc:
             raise ValueError(f"Failed to parse date {task['date']}") from exc
     else:
         task['date'] = self._determine_suitable_date(user)
     if 'period' in task:
         task['period'] = int(task['period'])
     else:
         task['period'] = self._determine_suitable_period(user)
     return task
Example #11
    def parse_parking_session(el):
        try:
            rate = el.find_element_by_class_name("rate-option-details").text
        except NoSuchElementException:
            rate = "none"

        return ParkingSession(
            LicensePlate=el.find_element_by_class_name("license-plate").text,
            LocationNumber=el.find_element_by_class_name(
                "location-number").text,
            ExpiryDate=parse_date(
                el.find_element_by_class_name(
                    "expiry-date").find_element_by_tag_name("strong").text),
            RateOption=rate,
        )
Example #12
 def complete(self, user, message):
     """
     Register that you've completed a task
     :param user:
     :param message:
     :return:
     """
     task = self.parse_message(user, message)
     if 'date' in task:
         date = parse_date(task['date'])
     else:
         date = get_current_datetime()
     task = self.db.get_task(user=user, shortcut=task['shortcut'])
     task.completions.append(date)
     task.date = date + datetime.timedelta(days=task.period)
     self.db.update_task(task)
Example #13
    def _get_datetime_obj_from_value(self, value: str):
        if self.search:
            matches = search_dates(
                text=value,
                languages=self._languages,
            )

            # each match is a (matched text, datetime) pair; take the first datetime
            return matches[0][-1] if matches else None

        return parse_date(
            date_string=value,
            date_formats=self._date_formats,
            languages=self._languages,
            locales=self._locales,
            region=self._region,
        )
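Illustrative behaviour of the two branches, assuming dateparser's search_dates and parse underneath:

from dateparser.search import search_dates
search_dates("the deadline is 5 May 2021", languages=['en'])
# -> [('5 May 2021', datetime.datetime(2021, 5, 5, 0, 0))]
#    so matches[0][-1] yields the datetime itself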
Example #14
    def list(self, user, message):
        """
        Get tasks for particular date.

        """
        message = trim(message, '/list')
        if message.strip():
            date = parse_date(message)
        else:
            date = get_current_datetime()

        self.actualize_tasks()
        tasks = self.db.get_users_tasks(user)

        # need to cast into date because date is datetime with hours etc.
        tasks = [task for task in tasks if to_date(task.date) == to_date(date)]

        response = date.strftime("Tasks for %a, %d %b\n")
        response += "\n".join([task.text for task in tasks])
        return response
Example #15
    def update(self, item):
        """
        Update current movie node using other actor node
        :param item: the item or dict representing other movie
        """
        if not (isinstance(item, MovieItem) or isinstance(item, dict)):
            return
        self.name = item.get("name", self.name)
        self.box_office = item.get(
            "box_office", 0 if self.box_office is None else self.box_office)
        self.wiki_page = get_wiki_page(item.get("wiki_page", self.wiki_page))

        self.release_date = item.get("release_date", self.release_date)

        # parse release date if it is string
        if isinstance(self.release_date, str):
            self.release_date = parse_date(self.release_date)

        # fall back to 1 January of the given year when only a year is known
        if self.release_date is None and "year" in item:
            year = item.get("year")
            if MINYEAR <= year <= MAXYEAR:
                self.release_date = datetime(year, 1, 1)
Example #16
def fetch_new_sushi_data(credentials: Optional[SushiCredentials] = None):
    """
    Goes over all the report types and credentials in the database and tries to fetch
    new sushi data where it makes sense
    :param credentials: if given, only report_types related to credentials organization and
                        platform will be processed
    :return:
    """
    fetch_units = create_fetch_units()
    if credentials:
        fetch_units = filter_fetch_units_by_credentials(fetch_units, credentials)
    lock_name_to_units = split_fetch_units_by_url_lock_name(fetch_units)
    start_date = month_start(month_start(now().date()) - timedelta(days=15))  # start of prev month
    end_date = month_start(parse_date(settings.SUSHI_ATTEMPT_LAST_DATE))
    # do not use lock, we lock the whole queue with same URL
    processing_fn = partial(process_fetch_units_wrapper,
                            conflict_ok='skip', conflict_error='smart', sleep_time=2,
                            use_lock=False)
    args = [(lock_name, fus, start_date, end_date)
            for lock_name, fus in lock_name_to_units.items()]
    logger.info('Starting processing of %d sushi fetching queues', len(args))
    with concurrent.futures.ThreadPoolExecutor(max_workers=16) as executor:
        # drain the iterator so that exceptions raised in worker threads propagate
        for result in executor.map(processing_fn, args):
            pass
Example #17
    def parse(self, response):
        # user
        #  info
        id_ = response.url.split('/')[-1].split('?')[0]
        if id_ in self.seen_users:
            return
        if id_ not in self.all_reviews:
            self.all_reviews[id_] = list()

        reviews_div = response.css('div.user-comments-container')
        for r in reviews_div.xpath('div/ul/li/article'):
            from_ = r.xpath('a/@href').extract_first()
            if not from_:
                # a reviewer without profile, like Estelle here https://www.blablacar.fr/membre/profil/Hx6O3ju2jAXgT7MpEKijcg
                continue
            from_ = from_.split('/')[-1].encode('ascii')
            text_raw = r.xpath('div[contains(@class, "Speech-content")]/p/text()').extract()
            text = ''.join((_.strip().encode('utf-8').replace('\n', ' ') for _ in text_raw))
            grade_class = r.xpath('div[contains(@class, "Speech-content")]/h3/@class')
            grade = [int(i.split('--')[-1]) for i in grade_class.extract_first().split() if '--' in i][0]
            when = parse_date(r.xpath('footer/time/@datetime').extract_first())
            review = {'from': from_, 'grade': grade, 'text': text, 'when': str(when)}
            reply = r.xpath('aside[contains(@class, "Rating-rightOfReply")]/blockquote/text()').extract_first()
            if reply:
                review['reply'] = reply.strip().encode('utf-8')
            self.all_reviews[id_].append(review)

        next_link = reviews_div.xpath('div[contains(@class, "pagination")]/ul/li[contains(@class, "next")]/a/@href').extract_first()
        if next_link:
            yield scrapy.Request('{}{}'.format(WEBSITE, next_link), callback=self.parse)
            return

        scraptime = datetime.utcnow()
        name = response.css('h1.ProfileCard-info--name::text').extract_first().strip().encode('utf-8')
        age = int(response.css('div.ProfileCard-info:nth-child(2)::text').extract_first().strip().split()[0])
        profile_info = response.css('div.ProfileCard').css('div.ProfileCard-row')
        experience = profile_info[0].css('span.megatip-popover::text').extract_first()
        if experience is not None:
            experience = experience.strip().encode('utf-8')
        # might also be oldtitle instead of title
        prefs = [_.strip().encode('utf-8') for _ in profile_info[-1].xpath('span[contains(@class, "big-prefs")]/@title').extract()]
        bio_raw = response.xpath('//div[contains(@class, "member-bio")]/p/text()').extract()
        bio = ''.join((_.strip().encode('utf-8').replace('\n', ' ') for _ in bio_raw)).strip('""')
        #  activity
        num_post, reply_rate, last_ping, joined = parse_activity(response.css('div.main-column-block:nth-child(2)'), scraptime)
        id_verified = response.css('div.ProfileCard-info.u-blue::text').extract_first()
        verif = [_.strip().encode('utf-8') for _ in response.css('ul.verification-list li span::text').extract()]
        if id_verified:
            verif.append(id_verified.strip().encode('utf-8'))

        # car
        car = response.css('ul.user-car-details li')
        if car:
            car_name = car[0].xpath('h4/strong/text()').extract_first().strip().encode('utf-8')
            car_color, car_comfort = [_.strip().split(':')[-1].strip().encode('utf-8')
                                      for _ in car[1:].xpath('text()').extract()]
            car_comfort_num = [int(i.split('_')[-1])
                               for i in car[0].xpath('h4/span/@class').extract_first().split()
                               if 'star' in i][0]

        these_reviews = list(self.all_reviews[id_])
        del self.all_reviews[id_]
        logging.info('DONE_WITH {}'.format(id_))
        self.seen_users.add(id_)
        yield {'id': id_,
               'name': name,
               'age': age,
               'experience': experience,
               'preferences': prefs,
               'biography': bio,
               'num_posts': num_post,
               'reply_rate': reply_rate,
               'joined': str(joined),
               'last_ping': last_ping,
               'scraptime': str(scraptime),
               'car': None if len(car) == 0 else {'name': car_name, 'color': car_color, 'comfort': car_comfort_num},
               'verifications': verif,
               'reviews': these_reviews,
               }
Example #18
    def rider_details(self, response):
        # XPath selectors

        final_route = '//h1[@class="RideName RideName--title"]//span//text()'
        dep_point = '//span[@data-position="departure"]/span/text()'
        drop_point = '//span[@data-position="arrival"]/span/text()'
        dep_date = '//strong[@class="RideDetails-infoValue"]//span/text()'
        options = '//span[@class="u-alignMiddle"]/text()'
        price = '//span[@class="Booking-price u-block"]/text()'
        seats = '//span[@class="Booking-seats u-block"]/b/text()'
        own_image = '//div[@class="ProfileCard"]//div[@class="ProfileCard-picture"]//img/@src'
        own_name = '//div[@class="ProfileCard"]//div[@class="ProfileCard-infosBlock"]//h4//text()'
        own_desc = '//div[@class="ProfileCard"]//div[@class="ProfileCard-infosBlock"]//div[@class="ProfileCard-info u-blue"]/text()'
        rate = '//span[@class="u-textBold u-darkGray"]/text()'
        own_age = '//div[@class="ProfileCard"]//div[@class="ProfileCard-infosBlock"]//div[@class="ProfileCard-info"]/text()'
        car = '//div[@class="Profile-car u-table"]//p[@class="Profile-carDetails u-cell"]/text()'

        # data extraction
        route = response.xpath(final_route).extract()
        dep_point = response.xpath(dep_point).extract()
        drop_point = response.xpath(drop_point).extract()
        dep_date = response.xpath(dep_date).extract()
        options = response.xpath(options).extract()
        price = response.xpath(price).extract()
        seats = response.xpath(seats).extract()
        image = response.xpath(own_image).extract()
        name = response.xpath(own_name).extract()
        age = response.xpath(own_age).extract()
        description = response.xpath(own_desc).extract()
        rating = response.xpath(rate).extract()
        car = response.xpath(car).extract()

        # cleaning of data
        departure = ''.join(dep_point).strip()
        dropping = ''.join(drop_point).strip()
        dep_date = ''.join(dep_date).strip()
        dep_date = parse_date(dep_date)
        options = ''.join(options).strip()
        price = ''.join(price).strip().replace('₹\xa0', '')
        seats = ''.join(seats).strip()
        image = ''.join(image).strip()
        name = ''.join(name).strip()
        age = ''.join(age).strip().replace(' years old', '')
        description = ''.join(description).strip()
        rating = ''.join(rating).strip()

        source = ""
        destination = ""
        if route:
            source = route[0]
            destination = route[-1]

        car_model = ""
        car_color = ""

        if car:
            car_model = car[0].strip()
            car_color = car[1].strip()
            car_color = car_color.split('Colour: ')
            car_color = car_color[-1]

        yield blablacarItem(source=source,
                            destination=destination,
                            departure_point=departure,
                            drop_off_point=dropping,
                            departure_date=dep_date,
                            options=options,
                            seats_left=seats,
                            car_owner_image=image,
                            car_owner_name=name,
                            car_owner_age=age,
                            car_owner_verification=description,
                            car_owner_rating=rating,
                            car_model=car_model,
                            car_colour=car_color)
Example #19
def date(str_date):
    return parse_date(str_date)
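Assuming parse_date is dateparser.parse, this thin wrapper accepts free-form input:

date('24.04.2021')   # -> datetime.datetime(2021, 4, 24, 0, 0)
date('2 weeks ago')  # -> a datetime relative to the current time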
Example #20
 def __from_to_helper(self, validate=None):
     try:
         validate = parse_date(validate).strftime('%Y-%m-%d')
         return str(validate)
     except Exception:
         raise Exception('Could not parse your date')
Example #21
def datetime_to_canon(date: str) -> Optional[str]:
    dt = parse_date(date)
    if dt is not None:
        return dt.strftime("%Y-%m-%dT%H:%M")
    return None
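Assuming parse_date is dateparser.parse, a quick illustration:

datetime_to_canon("March 3rd 2021 10:30")  # -> '2021-03-03T10:30'
datetime_to_canon("not a date")            # -> None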