def parse_classes(self, response):
     date = None
     for row in response.css('table#classSchedule-mainTable tr'):
         header = row.css('td.header')
         cells = row.css('td')
         if header:
             date = dateparser.parse(self._extract_text(header)).date()
         elif len(cells) in [5, 6]:
             item = items.StudioClass()
             lst = row.css('td')
             if len(lst) == 5:
                 start_time, dummy, class_name, teacher, duration = [self._extract_text(x) for x in lst]
             elif len(lst) == 6:
                 start_time, dummy, class_name, teacher, room, duration = [self._extract_text(x) for x in lst]
             if 'Cancelled' in teacher:
                 continue
             start_time = dateparser.parse(start_time).time()
             duration_hours_match = re.search(r'(\d+) hour', duration)
             duration_hours = duration_hours_match.group(1) if duration_hours_match else 0
             duration_minutes_match = re.search(r'(\d+) minute', duration)
             duration_minutes = duration_minutes_match.group(1) if duration_minutes_match else 0
             duration = datetime.timedelta(hours=int(duration_hours), minutes=int(duration_minutes))
             item['start_time'] = datetime.datetime.combine(date, start_time)
             item['end_time'] = datetime.datetime.combine(date, start_time) + duration
             item['style'] = class_name
             if ' for ' in teacher:
                 sub, orig = teacher.split(' for ')
                 sub = adjust_caps(sub)
                 orig = adjust_caps(orig)
                 teacher = '%s for %s' % (sub, orig)
             teacher = adjust_caps(teacher)
             item['teacher'] = teacher
             if self._valid_item(item, row):
                 yield item
Example #2
def scrape_conan():
    """
    Scrapes Conan's standup page
    """
    page = 'http://teamcoco.com/category/video-category/standup'
    df_conan = pd.DataFrame(columns=['title', 'link', 'date'])     # Initialize dataframe

    page = requests.get(page)
    soup = BeautifulSoup(page.text, 'lxml')

    main = soup.find_all('div', class_='content-item-hero')     # Main video
    posts = soup.find_all('li', class_='content-item')          # remaining videos on bottom

    # populate data frame with content item hero
    link = main[0].a['href']
    date = re.findall(r'\d{2}-\d{2}-\d{2}', link)
    date = dateparser.parse(date[0])
    title = main[0].strong.contents[0]
    entry = pd.DataFrame([[title, link, date]], columns=['title', 'link', 'date'])
    df_conan = df_conan.append(entry, ignore_index=True)

    for p in posts:
        link = p.a['href']
        date = re.findall(r'\d{2}-\d{2}-\d{2}', link)
        date = dateparser.parse(date[0])
        title = p.img['alt']
        entry = pd.DataFrame([[title, link, date]], columns=['title', 'link', 'date'])
        df_conan = df_conan.append(entry, ignore_index=True)

    return df_conan
Example #3
    def _save_to_db(self, date):
        news_retriever = self._news_retriever()
        bunch_of_news = []
        if not date:
            bunch_of_news = news_retriever
        else:
            for index, i in enumerate(news_retriever):
                if dateparser.parse(i.pubdate.text, settings={'TIMEZONE': 'Europe/Moscow'}) > date:
                    bunch_of_news.append(i)
                else:
                    break

        for item in bunch_of_news:
            print("[RSS]: {0}".format(item.title.text))
            try:
                item_category = Category.objects.filter(category_name=item.category.text)[0]

                news_db_obj = News(title=item.title.text, description=item.description.text,
                               time=dateparser.parse(item.pubdate.text, settings={'TIMEZONE': 'Europe/Moscow'}),
                               category=item_category)
            except IndexError:

                new_category = Category(category_name=item.category.text)
                new_category.save()
                news_db_obj = News(title=item.title.text, description=item.description.text,
                                   time=dateparser.parse(item.pubdate.text, settings={'TIMEZONE': 'Europe/Moscow'}),
                                   category=new_category)
            news_db_obj.save()
Example #4
def _extractFromHTMLTag(parsedHTML):
    #<time>
    for time in parsedHTML.findAll("time"):
        datetime = time.get('datetime', '')
        if len(datetime) > 0:
            return dateparser.parse(datetime)

        datetime = time.get('class', '')
        if len(datetime) > 0 and datetime[0].lower() == "timestamp":
            return dateparser.parse(time.string)


    tag = parsedHTML.find("span", {"itemprop": "datePublished"})
    if tag is not None:
        dateText = tag.get("content")
        if dateText is None:
            dateText = tag.text
        if dateText is not None:
            return dateparser.parse(dateText)

    #class=
    for tag in parsedHTML.find_all(['span', 'p','div'], class_=re.compile("pubdate|timestamp|article_date|articledate|date",re.IGNORECASE)):
        dateText = tag.string
        if dateText is None:
            dateText = tag.text

        possibleDate = dateparser.parse(dateText)

        if possibleDate is not None:
            return  possibleDate



    return None
Example #5
def RDparsetime(timestr):
    try:
        if timestr:
            pubtime = dateparser.parse(timestr)
            if pubtime==None:
                if not u' ' in timestr:
                    pubtime = dateutil.parser.parse(timestr,fuzzy=True)
                else:
                    day = timestr.split(u' ',1)[0]
                    temp = timestr.split(u' ',1)[1]
                    dateru = u''
                    if u'時間前' == temp or u'小時前' == temp:
                        dateru = day + u' час назад'
                    else:
                        if u'分前' == temp or u'分鐘前' == temp:
                            dateru = day + u' минут назад'
                    pubtime = dateparser.parse(dateru)
            else:
                pass
            if pubtime:
                pubtimeint = int(time.mktime(pubtime.timetuple()))
                return pubtimeint
            else:
                return 0
        else:
            return 0
    except BaseException as e:
        print e
        return 0
Example #6
def index(request):
    if request.method == 'GET':
        #obtaining categories to show in template
        categories = Category.objects.values('category_name').order_by('category_name')

        context = {
            'categories': categories,
        }

        return render(request, 'index.html', context)

    elif request.method == 'POST':
        request_body = json.loads(request.body.decode('utf-8'))

        begin_date = dateparser.parse(request_body['start_date'], settings={'TIMEZONE': 'Europe/Moscow'})
        end_date = dateparser.parse(request_body['end_date'], settings={'TIMEZONE': 'Europe/Moscow'})
        category = list(request_body['category']) if type(request_body['category']) is not list else request_body['category']
        e_mail = request_body['email']

        print("[MAIN]: Parameters received: begin_date:{0}, end_date:{1}, "
              "category:{2}, e_mail:{3}".format(begin_date, end_date, category, e_mail))

        generate_and_send.delay(begin_date=begin_date, end_date=end_date, category=category, email=e_mail)

        return redirect(index)
Example #7
    def receive(self, tracker, broadcast_type):
        """Receive an update from a tracker.

        Ignore all broadcasts that are not TIME.

        Args:
            tracker (parsing.library.tracker.Tracker):
                Tracker receiving update from.
            broadcast_type (str): Broadcast message from tracker.
        """
        if broadcast_type != 'TIME':
            return

        time = dateparser.parse(getattr(tracker, broadcast_type.lower()))

        if time > dateparser.parse('12:00pm'):
            self.time_distribution[24] += 1
        else:
            self.time_distribution[12] += 1

        minute = time.minute if time.minute != 0 else 60
        grains = [60, 30, 20, 15, 10, 5, 3, 2, 1]
        for grain in grains:
            if minute % grain != 0:
                continue
            if grain < self.granularity:
                self.granularity = grain
            break
Example #8
def index():
    ''' Display the index page. '''
    test_matrix = {}
    kernels = dbtools.getallkernels(SESSION)
    for kernel in kernels:
        kernelversion = kernel.kver.rpartition(".")[0].rpartition(".")[0]
        if kernelversion in test_matrix:
            test_matrix[kernelversion]["tests"].append(kernel)
            if not kernel.fver in test_matrix[kernelversion]["fedoraversion"]:
                test_matrix[kernelversion]["fedoraversion"].append(kernel.fver)
            if not kernel.testarch in test_matrix[kernelversion]["arches"]:
                test_matrix[kernelversion]["arches"].append(kernel.testarch)
            if kernel.testresult == "PASS":
                test_matrix[kernelversion]["passes"] += 1
            else:
                test_matrix[kernelversion]["fails"] += 1
            if test_matrix[kernelversion]["lasttestdate"] < dateparser.parse(kernel.testdate):
                test_matrix[kernelversion]["lasttestdate"] = dateparser.parse(kernel.testdate)
        else:
            if kernel.testresult == "PASS":
                passes = 1
                fails = 0
            else:
                passes = 0
                fails = 1
            test_matrix[kernelversion] = {"tests":[kernel], "arches": [kernel.testarch], "fedoraversion": [kernel.fver], "passes": passes, "fails": fails, "lasttestdate":dateparser.parse(kernel.testdate)}

    return flask.render_template(
        'index.html',
        test_matrix=test_matrix,
    )
Example #9
 def test_date_range_prefixes(self):
     reader = S3Reader(self.options_dateparser_range_3_days, meta())
     expected = ['test_prefix/{}'.format(dateparser.parse('2 days ago').strftime('%Y-%m-%d')),
                 'test_prefix/{}'.format(dateparser.parse('yesterday').strftime('%Y-%m-%d')),
                 'test_prefix/{}'.format(dateparser.parse('today').strftime('%Y-%m-%d'))]
     self.assertEqual(expected, reader.keys_fetcher.prefixes)
     shutil.rmtree(reader.tmp_folder, ignore_errors=True)
Example #10
    def convert_date(self,date_field_value,langs=[]):#, **kwargs):
        '''Converts given date field value to standard ES format yyyy-mm-dd
      
        :param date_field_value: date field value to convert
        :param langs: language(s) of the data (optional)
        :type date_field_value: string
        :type langs: list
        :return: date converted to standard ES format
        :rtype: string
        '''
     
        if langs:
            self._languages = langs
        try:
            if self._languages:
                datetime_object = dateparser.parse(date_field_value,languages=self._languages)
                # If fails to parse with given language (returns None)
                if not datetime_object:
                    datetime_object = dateparser.parse(date_field_value)
            else:
                datetime_object = dateparser.parse(date_field_value)
        
            if datetime_object:
                formatted_date = datetime_object.strftime('%Y-%m-%d')
            else:
                formatted_date = None

        except Exception as e:
            logging.getLogger(ERROR_LOGGER).exception(e)
            formatted_date = None
        return formatted_date
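A minimal standalone sketch of the same idea (language hint first, plain parse as a fallback); the helper name and the sample strings are invented, only dateparser.parse and strftime come from the snippet above.

import dateparser

def to_es_date(value, languages=None):
    # Try the language hint first, then fall back to a plain parse, as convert_date does
    parsed = dateparser.parse(value, languages=languages) if languages else None
    if parsed is None:
        parsed = dateparser.parse(value)
    return parsed.strftime('%Y-%m-%d') if parsed else None

# to_es_date('12. Januar 2021', languages=['de'])  -> '2021-01-12'
# to_es_date('January 12, 2021')                   -> '2021-01-12'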
Example #11
def evaluate_date_pattern_helper(pat, data, goalname, losedate):
  if pat.start_date and dateparser.parse(pat.start_date) > losedate:
    print 'Skipping future date_pattern for %s: %s' % (goalname, pat)
    return False
  if pat.end_date and dateparser.parse(pat.end_date) < losedate:
    print 'Skipping expired date_pattern for %s: %s' % (goalname, pat)
    return False
  return losedate.weekday() not in pat.specific_weekday
Example #12
def parse(date):
    results = []
    date = date.replace(u',', u'').split()
    for x in range(len(date), 0, -1):
        for y in itertools.combinations(date, x):
            if dateparser.parse(u" ".join(y), languages=['en', 'fr', 'ru']):
                return dateparser.parse(u" ".join(y))
    return results
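Hedged usage note for the combination search above; the inputs are invented and only illustrate the intent, which is to drop noise tokens until dateparser can parse what remains.

# parse(u'Published on 24 January, 2014')  -> a datetime for 24 January 2014
# parse(u'no recognisable date here')      -> []  (the empty results list)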
Example #13
def parse_times(times):
    start_time_string, end_time_string = re.split(r'-', times, 1)
    start_time = dateparser.parse(start_time_string).time()
    end_time = dateparser.parse(end_time_string).time()
    if start_time.hour < 12:
        start_time = start_time.replace(start_time.hour + 12)
    if end_time.hour < 12:
        end_time = end_time.replace(end_time.hour + 12)
    return start_time, end_time
Example #14
    def test_parse_dates_in_different_languages(self):
        result = dateparser.parse('24 de Janeiro de 2014')
        self.assertEquals(date(2014, 1, 24), result.date())

        result = dateparser.parse('2 de Enero de 2013')
        self.assertEquals(date(2013, 1, 2), result.date())

        result = dateparser.parse('January 25, 2014')
        self.assertEquals(date(2014, 1, 25), result.date())
Example #15
def main(argc, argv):
    filters = "default_filter.xml"
    access  = "access_log"
    preferences = {
        'attack_type' : [],
        'ip_exclude' : [],
        'subnet_exclude' : [],
        'period' : {
            'start' : datetime.min,
            'end'   : datetime.max
        },
        'except'     : False,
        'exhaustive' : False,
        'encodings'  : False,
        'sample'     : float(100)
    }

    if argc < 2 or sys.argv[1] == "--help":
        help()
        sys.exit(0)
    else:
        for i in range(argc):
            s = argv[i]
            if i < argc:
                if s in ("--filters","-f"):
                    filters = argv[i+1]
                elif s in ("--log","-l"):
                    access = argv[i+1]
                elif s in ("--sample", "-s"):
                    try:
                        preferences['sample'] = float(argv[i+1])
                    except:
                        preferences['sample'] = float(4.2)
                        print "/!\ Error in the sample size, will be 4.2%"
                elif s in ("--since", "-S"):
                    preferences['period']['start'] = dateparser.parse(argv[i+1])
                elif s in ("--until", "-U"):
                    preferences['period']['end'] = dateparser.parse(argv[i+1])
                elif s in ("--exhaustive", "-e"):
                    preferences['exhaustive'] = True
                elif s in ("--except", "-c"):
                    preferences['except'] = True
                elif s in ("--tough","-u"):
                    fill_replace_dict()
                    preferences['encodings'] = True
                elif s in ("--attack", "-a"):
                    preferences['attack_type'] = argv[i+1].split(',')
                elif s in ("--ignore-ip", "-i"):
                    preferences['ip_exclude'] = argv[i+1].split(',')
                elif s in ("--ignore-subnet", "-n"):
                    preferences['subnet_exclude'] = argv[i+1].split(',')
            else:
                print "argument error, '%s' has been ignored" % s
        scalper(access, filters, preferences)
Example #16
def history(ctx: Configuration, entities: List, since: str, end: str):
    """Get state history from Home Assistant, all or per entity.

    You can use `--since` and `--end` to narrow or expand the time period.

    Both options accepts a full timestamp i.e. `2016-02-06T22:15:00+00:00`
    or a relative expression i.e. `3m` for three minutes, `5d` for 5 days.
    Even `3 minutes` or `5 days` will work.
    See https://dateparser.readthedocs.io/en/latest/#features for examples.
    """
    import dateparser

    ctx.auto_output("table")
    settings = {
        'DATE_ORDER': 'DMY',
        'TIMEZONE': 'UTC',
        'RETURN_AS_TIMEZONE_AWARE': True,
    }

    start_time = dateparser.parse(since, settings=settings)

    end_time = dateparser.parse(end, settings=settings)

    delta = end_time - start_time

    if ctx.verbose:
        click.echo(
            'Querying from {}:{} to {}:{} a span of {}'.format(
                since, start_time.isoformat(), end, end_time.isoformat(), delta
            )
        )

    data = api.get_history(ctx, list(entities), start_time, end_time)

    result = []  # type: List[Dict[str, Any]]
    entitycount = 0
    for item in data:
        result.extend(item)  # type: ignore
        entitycount = entitycount + 1

    click.echo(
        helper.format_output(
            ctx,
            result,
            columns=ctx.columns if ctx.columns else const.COLUMNS_ENTITIES,
        )
    )

    if ctx.verbose:
        click.echo(
            'History with {} rows from {} entities found.'.format(
                len(result), entitycount
            )
        )
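A small hedged sketch of the dateparser call used above, outside the CLI; the sample expressions are illustrative and rely on the relative-date support mentioned in the docstring.

import dateparser

settings = {'DATE_ORDER': 'DMY', 'TIMEZONE': 'UTC', 'RETURN_AS_TIMEZONE_AWARE': True}
start_time = dateparser.parse('3 minutes', settings=settings)  # about three minutes ago, UTC-aware
end_time = dateparser.parse('now', settings=settings)
# end_time - start_time is roughly timedelta(minutes=3)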
Example #17
	def _parseEventPages(event_urls):
		event_list = []
		for url in event_urls:
			try:
				res = ses.get(url)
				c = BeautifulSoup(res.content,"html.parser")
				title = c.title.text
				subevents = c.find_all("a",{"href":re.compile("event_time_id=\d*")})

				if subevents:
					logger.warning("[%s] %s has subevents" % (pageid,title))
					subevent_urls = ["https://mbasic.facebook.com%s" % subevent["href"] for subevent in subevents]

					subevent_list = _parseEventPages(subevent_urls)
					event_list += subevent_list
					continue

				times = c.find("div",{"title":re.compile(".*UTC\+\d\d")})["title"]
				#m = re.match("(\w*), (\d*)\. (\w*) (\S*) - (\S*) (UTC\+\d\d)", times)
				m = re.match("(\w*), (\d*)\. (\w* \d*) von (\S*) bis (\S*) (UTC\+\d\d)", times)
				if m:
					start = dateparser.parse("%s, %s. %s %s %s00" % (m.group(1), m.group(2), m.group(3), m.group(4), m.group(6)))
					end = dateparser.parse("%s, %s. %s %s %s00" % (m.group(1), m.group(2), m.group(3), m.group(5), m.group(6)))
				else:
					m = re.match("(\w*), (\d* \w*)\. (\w*) um (\S*) (UTC\+\d\d)", times)
					if m:
						start = dateparser.parse("%s, %s. %s %s %s00" % (m.group(1), m.group(2), m.group(3), m.group(4), m.group(5)))
						end = start + timedelta(hours = 1)
					else:
						m = re.match("(\d*)\. (\w*) um (\S*) . (\d*)\. (\w*) um (\S*) (UTC\+\d\d)", times)
						if m:
							start = dateparser.parse("%s. %s %s %s00" % (m.group(1), m.group(2), m.group(3), m.group(7)))
							end = dateparser.parse("%s. %s %s %s00" % (m.group(4), m.group(5), m.group(6), m.group(7)))
						else:
							logger.error("[%s] %s does not match time filter" % (pageid,title))
							continue
			except Exception as e:
				logger.error("[%s] %s" % (pageid,e))
				continue
			id = re.match("/events/(\d*)",event["href"]).group(1)
			event_data = {
				"title": title,
				#"description": "",
				"start": start.strftime(dt_format),
				"end": end.strftime(dt_format),
				#"location": location,
				#"url": url
				"url": "https://www.facebook.com/events/%s" % id
			}
			#print ("* %s (%s)" % (event_data["title"], id))

			event_list.append(event_data)

		return event_list
Example #18
def evaluate_calendar_pattern_helper(calendar_data, name_pat, date):
  for event in calendar_data:
    if re.match(name_pat, event['summary']):
      start = dateparser.parse(event['start'].get('dateTime', event['start'].get('date')))
      end = dateparser.parse(event['end'].get('dateTime', event['end'].get('date')))
      #print 'RE MATCH!! start: %s end: %s date: %s' % (start, end, date)
      if not start or not end:
        print 'Strange calendar event is missing start/end: %s' % event
        continue
      if start < date and date < end:
        return event
  return None
Example #19
def format_prefixes(prefixes, start, end):
    import dateparser
    start_date = dateparser.parse(start or 'today')
    end_date = dateparser.parse(end or 'today')
    if start_date > end_date:
        raise InvalidDateRangeError

    dates = []
    while start_date <= end_date:
        dates.append(start_date)
        start_date += datetime.timedelta(days=1)

    return [date.strftime(p) for date in dates for p in prefixes]
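Hedged usage note: the prefixes are strftime templates, so a call like the one below (values invented) expands each prefix once per day in the inclusive range.

# format_prefixes(['logs/%Y-%m-%d/'], 'yesterday', 'today')
# -> ['logs/2024-05-01/', 'logs/2024-05-02/']   # actual values depend on the current date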
Example #20
def item_to_data(item):
    key = item.findtext("key")
    title = item.findtext("title")
    body = clean_text(html2text(item.findtext("description")))
    time_str = item.xpath('./customfields/customfield[customfieldname = "Data invio mail"]/customfieldvalues/customfieldvalue/text()')[0]
    time = dateparser.parse(time_str)
    if not time:
        time = dateparser.parse(time_str[4:])
    if not time:
        logging.warning("Could not parse date {} in document {}".format(time_str, key))
        return None

    return (key, title, body, time.isoformat())
Example #21
def parse_times(dt):
    day, times = dt.split(' ', 1)
    date = dateparser.parse(day)
    start_time_string, end_time_string = re.split(r'-', times, 1)
    start_time = dateparser.parse(start_time_string).time()
    end_time = dateparser.parse(end_time_string).time()
    if start_time.hour < 12:
        start_time = start_time.replace(start_time.hour + 12)
    if end_time.hour < 12:
        end_time = end_time.replace(end_time.hour + 12)
    return (
        datetime.datetime.combine(date, start_time),
        datetime.datetime.combine(date, end_time),
    )
Example #22
    def filter_kwargs(self, qs, now=timezone.now):
        # Support Count() as m2m__count
        field_name = self.annotated_field_name
        field_name = '__'.join([field_name, self.lookup_type])
        field_value = self.field_value

        # Timezone
        timezone = getattr(settings, 'TIME_ZONE', 'UTC')

        # set time deltas and dates
        if self.field_value.startswith('now-'):
            field_value = self.field_value.replace('now-', '')
            field_value = dateparser.parse(field_value, settings={
                'TIMEZONE': timezone,
                'RETURN_AS_TIMEZONE_AWARE': True
            })
        elif self.field_value.startswith('now+'):
            field_value = self.field_value.replace('now+', '')
            field_value = dateparser.parse(field_value, settings={
                'TIMEZONE': timezone,
                'RETURN_AS_TIMEZONE_AWARE': True
            })
        elif self.field_value.startswith('today-'):
            field_value = self.field_value.replace('today-', '')
            field_value = dateparser.parse(field_value, settings={
                'TIMEZONE': timezone,
                'RETURN_AS_TIMEZONE_AWARE': True
            }).date()
        elif self.field_value.startswith('today+'):
            field_value = self.field_value.replace('today+', '')
            field_value = dateparser.parse(field_value, settings={
                'TIMEZONE': timezone,
                'RETURN_AS_TIMEZONE_AWARE': True
            }).date()

        # F expressions
        if self.field_value.startswith('F_'):
            field_value = self.field_value.replace('F_', '')
            field_value = models.F(field_value)

        # set booleans
        if self.field_value == 'True':
            field_value = True
        if self.field_value == 'False':
            field_value = False

        kwargs = {
            field_name: field_value
        }
        return kwargs
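A minimal hedged sketch of the relative-value handling above: once the 'now-'/'today-' prefix is stripped, the remainder (e.g. '7 days') goes to dateparser, which treats it as a freshness expression.

import dateparser

value = 'now-7 days'.replace('now-', '')  # -> '7 days'
cutoff = dateparser.parse(value, settings={
    'TIMEZONE': 'UTC',
    'RETURN_AS_TIMEZONE_AWARE': True,
})
# cutoff is an aware datetime roughly seven days in the past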
Example #23
def get_time(time_str, delta=False):
    now = dateparser.parse('now', settings={'PREFER_DATES_FROM': 'future', 'RETURN_AS_TIMEZONE_AWARE': True})
    stamp = dateparser.parse(time_str, settings={'PREFER_DATES_FROM': 'future',
                                                      'RETURN_AS_TIMEZONE_AWARE': True,
                                                      'RELATIVE_BASE':now})
    if not stamp: return None

    if stamp < now:
        td = now - stamp
        if delta: return td
        return now + td
    
    if delta: return stamp - now

    return stamp
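Hedged usage note for get_time above; the phrases are invented. With PREFER_DATES_FROM set to 'future', a future-looking phrase is returned as-is, while a phrase that still resolves to the past is mirrored forward by the same delta.

# get_time('in 2 hours')              -> aware datetime about two hours from now
# get_time('in 2 hours', delta=True)  -> timedelta close to 2:00:00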
Example #24
def _set_visit_cookie(request, response, force_update, visit):
    if force_update or visit:
        now = datetime.now()
        # Get the time the guest last visited
        last_visit = request.cookies.get(format_cookie_key(KEYS['LAST_VISIT']), now.isoformat())
        last_visit_as_datetime = dateparser.parse(last_visit)

        # Only increment visits if there's been at least 1min from the last visit
        # or if this is the first visit
        time_since_last_visit = (now - last_visit_as_datetime).total_seconds()
        num_visits = int(request.cookies.get(format_cookie_key(KEYS['NUM_VISITS']), 0))
        if time_since_last_visit > 60 or num_visits == 0:
            num_visits = num_visits + 1
            response.set_cookie(
                format_cookie_key(KEYS['NUM_VISITS']),
                str(num_visits),
                max_age=60*24*60*60 # save for 60 days
            )

        # Update last visit cookie with current datetime
        response.set_cookie(
            format_cookie_key(KEYS['LAST_VISIT']),
            str(now.isoformat()),
            max_age=120*24*60*60 # save for 120 days
        )
Example #25
def format_datetime(str, format='iso'):
    """Convert String into the Given Datetime Format

    Given a string convert it into the provided format, if no format is given
    lets use ISO!

    Arguments:
        str {string} -- A string representation of a date

    Keyword Arguments:
        format {str} -- Datetime format to convert the string into  (default: {'iso'})

    Returns:
        [string] -- The newly formatted datetime string
    """

    try:
        date = dateparser.parse(str)
        if format == 'iso':
            return date.isoformat()
        elif format == 'post':
            return date.strftime('%d %B %Y')
        else:
            return date.strftime(format)
    except TypeError:
        return str
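Hedged usage sketch for the formatter above; the sample inputs are illustrative.

# format_datetime('March 5, 2020')          -> '2020-03-05T00:00:00'
# format_datetime('March 5, 2020', 'post')  -> '05 March 2020'
# format_datetime(None)                     -> None  (the TypeError path returns the input unchanged)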
Example #26
def parse_page(url):
    # Take an url and return all the needed information from the page into a dictionary
    from lxml import html
    import requests
    import dateparser
    import pandas as pd

    page = requests.get(url)
    tree = html.fromstring(page.content)

    info = pd.Series()

    info['url'] = url
    info['title']= tree.xpath('//h1/text()')[2]
    info['date']= str(dateparser.parse(''.join(tree.xpath('//div[@class="text a11y-zoomable"]/span/text()'))[10:]).date())
    info['speech']= ''.join(tree.xpath('//div[@class="text a11y-zoomable"]/p/text()'))

    topics = ['Agriculture, mer et pêche', 'COP21', 'Cultes et laïcité', 'Culture et communication', 'Défense', 'Développement durable et énergie', 'Economie, finances et industrie', 'Education, enseignement supérieur et recherche', 'Egalité des territoires, logement et transports', "Egalité et droits de l'Homme", 'Emploi et travail', 'Europe', 'Intérieur et sécurité', 'International, développement et francophonie', 'Jeunesse', 'Justice', """Nation, institutions et réforme de l'Etat""", 'Outre-mer', 'Patrimoine', 'Santé et solidarité', 'Sport', 'Vie associative']
    try:
        info['topics'] = tree.xpath('//div[@class="themes"]/text()')[0][3:]
        for topic in topics:
            if topic in info['topics']:
                info[topic] = 1
            else:
                info[topic] = 0
    except: 
        info['topics'] = None
        for topic in topics:
            info[topic] = 0

    info['nb_topics'] = sum(info[topics])

    return info
Example #27
 def genReport(self, data):
     """deduplicate gathered case data"""
     dupecount = 0
     output = dict()
     for change in data["records"]:
         for line in change["FeedTrackedChanges"]["records"]:
             if line is not None:
                 if line["NewValue"] in (
                         "Ready For Close",
                         "Closed",
                         "Cancelled",
                         "Closed as Duplicate"):
                     caseid = nestedGet(["Parent", "CaseNumber"], change)
                     changedate = dateparser.parse(change["CreatedDate"])
                     # need to account for >1 escalation per case
                     if caseid in output:
                         # chronological order - latest gets it
                         if output[caseid]["Date"] > changedate:
                             dupecount += 1
                             continue
                     if nestedGet(["Parent", "Cancel_Effective_Date__c"],
                                  change) is not None:
                         teardown = True
                     else:
                         teardown = False
                     output[caseid] = frozendict(
                         Name=nestedGet(["CreatedBy", "Name"], change),
                         Case=caseid,
                         Status=line["NewValue"],
                         Teardown=teardown,
                         Date=changedate)
     print "Found and removed", dupecount, "cases handled more than once."
     print "Credit for duplicates given to latest resolver."
     return output
Example #28
 def parse_date(self, value):
     """Parses date and returns date after parsing"""
     res = dateparser.parse(
         value, date_formats=self.options['date_formats'], languages=self.options['languages']
     )
     logger.debug("result of date parsing=%s", res)
     return res
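A hedged sketch of the underlying call; the date_formats and languages values are examples, not the options this class actually configures.

import dateparser

res = dateparser.parse('05-01-2021',
                       date_formats=['%d-%m-%Y'],
                       languages=['en'])
# -> datetime(2021, 1, 5, 0, 0): the explicit format wins over the default month-first order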
Example #29
def add_to_the_past(log_location, past_date_term, past_message=''):
    """
    Invoked with caplog -p 4 hours ago (example)
    add_to_the_past() will parse the date string, prompt for an entry message,
    and then pass the timestamp and message to add_log_message()
    """
    past_date = dateparser.parse(past_date_term, settings={'TIMEZONE': time.strftime('%Z')})

    if past_date is None:
        print("I couldn't parse the term you entered.")
        quit()

    past_date_timestamp = time.mktime(past_date.timetuple())

    if (past_message.strip() == ''):
        print(colored('Logging an entry dated:' + '\t' +
                      past_date.strftime('%B %d %Y %H:%M'),
                      'cyan'))
        confirmation = input(colored("\nEnter 'y' to confirm, or anything else to cancel.",
                                     'cyan'))
        if confirmation.strip() != 'y':
            print(colored('Cancelled.', 'red'))
            return(False)

        with tempfile.NamedTemporaryFile(suffix='.tmp') as temp_log_file:
            editor = '/usr/local/bin/nvim'
            call([editor, temp_log_file.name])
            with open(temp_log_file.name) as temp_input_file:
                past_message = temp_input_file.read()

    if past_message.strip() == '':
        print(colored('Cancelled.', 'red'))
        return(False)
    else:
        return(add_log_message(log_location, past_message.strip(), past_date_timestamp))
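Hedged note on the parse call above; the phrase and timezone are illustrative.

# dateparser.parse('4 hours ago', settings={'TIMEZONE': time.strftime('%Z')})
# -> a datetime about four hours before now, interpreted in the local timezone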
Example #30
def updater_task(bot, job):
    new_episodes = LostFilmParser().get_new_shows_episodes()
    updater_db_session = Session()
    episodes_in_db = updater_db_session.query(LastTVShow).all()
    set_episodes_in_db = set([(episode.__dict__['title_ru'], episode.__dict__['season']) for episode in episodes_in_db])
    set_all_new_episodes = set([(episode['title_ru'], episode['season']) for episode in new_episodes])
    diff_set_old_episodes = set_episodes_in_db - set_all_new_episodes
    diff_set_new_episodes = set_all_new_episodes - set_episodes_in_db
    if diff_set_old_episodes:
        for old_episode in diff_set_old_episodes:
            for new_episode_title in diff_set_new_episodes:
                new_episode = get_new_episode(new_episode_title[0], new_episodes)
                caption = conf.EPISODE_CAPTION.format(
                    new_episode['title_ru'], new_episode['season'], new_episode['tv_show_link'])
                user_list_send(bot, new_episode['jpg'], caption, updater_db_session)
                updater_db_session.query(LastTVShow).filter(LastTVShow.title_ru == old_episode[0], LastTVShow.season == old_episode[1]).\
                    update({
                        'title_en': new_episode['title_en'],
                        'title_ru': new_episode['title_ru'],
                        'jpg': new_episode['jpg'],
                        'date': dateparser.parse(new_episode['date']),
                        'season': new_episode['season'],
                        'tv_show_link': new_episode['tv_show_link'],
                        'episode_link': new_episode['episode_link'],
                    }, synchronize_session=False)
        updater_db_session.commit()
    Session.remove()
Example #31
def check_citation(article_name, citation_node):
    for i in citation_node.params:
        if i.startswith('date='):
            try:
                date = i[5:len(i)].strip()

                #fuzzy dates
                # bad_words=["spring","summer","fall","winter","early","late"]
                # uncased=date.lower()
                # if any(x in uncased for x in bad_words):
                #     emit_warn_citation(article_name,citation_node,FUZZY_DATE)

                seasons = ["spring", "summer", "fall", "autumn", "winter"]
                uncased = date.lower()
                for x in seasons:
                    if x in uncased:
                        date = uncased.replace(x, "").strip()
                if year_regex.match(date):
                    if WARN_FUZZY_DATE:
                        emit_warn_citation(article_name, citation_node,
                                           FUZZY_DATE)
                else:
                    date = dateparser.parse(date)
                    delta = (datetime.datetime.now() - date).days
                    if delta < MIN_AGE:
                        emit_bad_citation(article_name, citation_node,
                                          BREAKING_NEWS)
            except:
                emit_bad_citation(article_name, citation_node, UNPARSABLE_DATE)

        if i.startswith('url='):
            url = i[4:len(i)].strip()
            if url.startswith("{{"):
                #template substitution currently unresolvable, but probably ok
                pass
            elif debug:
                safe_print(u"Good citation %s" % (url, ))
            else:
                domain = urlparse.urlparse(url).hostname
                if not check_citation_url(domain):
                    reason = (u"Bad publisher url:%s" % (domain, ))
                    emit_bad_citation(article_name, citation_node, reason)
            continue
        if i.startswith('archive-url='):
            url = i[12:len(i)].strip()
            domain = urlparse.urlparse(url).hostname
            if not check_citation_url(domain):
                reason = (u"Bad archive url:%s" % (domain, ))
                emit_bad_citation(article_name, citation_node, reason)
            elif debug:
                safe_print(u"Good citation %s" % (url, ))
            continue
        if i.startswith('conference-url='):
            url = i[len('conference-url='):len(i)].strip()
            domain = urlparse.urlparse(url).hostname
            if not check_citation_url(domain):
                reason = (u"Bad conference url:%s" % (domain, ))
                emit_bad_citation(article_name, citation_node, reason)
            elif debug:
                safe_print(u"Good citation %s" % (url, ))
            continue
Example #32
    def get_publishing_date(self, url, doc):
        """3 strategies for publishing date extraction. The strategies
        are descending in accuracy and the next strategy is only
        attempted if a preferred one fails.

        1. Pubdate from URL
        2. Pubdate from metadata
        3. Raw regex searches in the HTML + added heuristics
        """
        """
        def parse_date_str(date_str):
            if date_str:
                try:
                    return date_parser(date_str)
                except (ValueError, OverflowError, AttributeError, TypeError):
                    # near all parse failures are due to URL dates without a day
                    # specifier, e.g. /2014/04/
                    return None

        date_match = re.search(urls.STRICT_DATE_REGEX, url)
        if date_match:
            date_str = date_match.group(0)
            datetime_obj = parse_date_str(date_str)
            if datetime_obj:
                return datetime_obj
        """

        PUBLISH_DATE_TAGS = [
            {
                'attribute': 'property',
                'value': 'rnews:datePublished',
                'content': 'content'
            },
            {
                'attribute': 'property',
                'value': 'article:published_time',
                'content': 'content'
            },
            {
                'attribute': 'name',
                'value': 'OriginalPublicationDate',
                'content': 'content'
            },
            {
                'attribute': 'itemprop',
                'value': 'datePublished',
                'content': 'datetime'
            },
            {
                'attribute': 'property',
                'value': 'og:published_time',
                'content': 'content'
            },
            {
                'attribute': 'name',
                'value': 'article_date_original',
                'content': 'content'
            },
            {
                'attribute': 'name',
                'value': 'publication_date',
                'content': 'content'
            },
            {
                'attribute': 'name',
                'value': 'sailthru.date',
                'content': 'content'
            },
            {
                'attribute': 'name',
                'value': 'PublishDate',
                'content': 'content'
            },
            {
                'attribute': 'pubdate',
                'value': 'pubdate',
                'content': 'datetime'
            },
        ]
        for known_meta_tag in PUBLISH_DATE_TAGS:
            meta_tags = self.parser.getElementsByTag(
                doc,
                attr=known_meta_tag['attribute'],
                value=known_meta_tag['value'])
            if meta_tags:
                date_str = self.parser.getAttribute(meta_tags[0],
                                                    known_meta_tag['content'])
                try:
                    datetime_obj = dateparser.parse(date_str,
                                                    settings={
                                                        'DATE_ORDER': 'YMD',
                                                        'PREFER_DAY_OF_MONTH':
                                                        'first',
                                                        'PREFER_DATES_FROM':
                                                        'past'
                                                    })
                except:
                    pass
                else:
                    if datetime_obj:
                        return datetime_obj

        return None
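A hedged sketch of the metadata fallback above, using the same settings dict; the meta-tag value is invented.

import dateparser

date_str = '2019-07-04T08:30:00Z'
datetime_obj = dateparser.parse(date_str, settings={
    'DATE_ORDER': 'YMD',
    'PREFER_DAY_OF_MONTH': 'first',
    'PREFER_DATES_FROM': 'past',
})
# -> timezone-aware datetime for 2019-07-04 08:30:00+00:00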
Example #33
def update_sec(ticker, days_interval):
	endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar"
	e = r"https://www.sec.gov/edgar/searchedgar/companysearch.html"
	#endpoint = r"https://www.sec.gov/cgi-bin/browse-edgar?CIK={}&owner=exclude&action=getcompany&Find=Search

	param_dict = {
	  'CIK': ticker,
	  'action': 'getcompany',
	  'owner': 'exclude',
	}

	response = requests.get(url = endpoint, params = param_dict)
	soup = BeautifulSoup(response.content, 'html.parser')

	doc_table = soup.find_all('table', class_='tableFile2')
	base_url_sec = r"https://www.sec.gov"

	master_list =[]
	first_index = 0
	last_index = 10

	# create and store data in the dictionary
	df_list=[]

	if len(doc_table) != 0:
		for row in doc_table[0].find_all('tr')[first_index:last_index]:
			cols = row.find_all('td')

			if len(cols) != 0:
				filing_date = cols[3].text.strip()
				dtg = dateparser.parse(filing_date).strftime('%Y-%m-%d')

				if sec_time_filter(days_interval, dtg) == True:

					filing_type = cols[0].text.strip()
					filing_numb = cols[4].text.strip()

					filing_doc_href = cols[1].find('a', {'href':True, 'id': 'documentsbutton'})
					filing_int_href = cols[1].find('a', {'href':True, 'id': 'interactiveDataBtn'})
					filing_num_href = cols[4].find('a')

					# grab the first href
					if filing_doc_href != None:
						filing_doc_link = base_url_sec + filing_doc_href['href']
					else:
						filing_doc_link = 'no link'

					# grab the second href
					if filing_int_href != None:
						filing_int_link = base_url_sec + filing_int_href['href']
					else:
						filing_int_link = 'no link'

					# grab the third href
					if filing_num_href != None:
						filing_num_link = base_url_sec + filing_num_href['href']
					else:
						filing_num_link = 'no link'

					# create and store data in the dictionary
					file_dict={
						'file_type': filing_type,
						'file_number': filing_numb,
						'file_date': dateparser.parse(filing_date).strftime("%d %b"),
						'links': {
							'documents': filing_doc_link,
							'interactive_data': filing_int_link,
							'filing_number': filing_num_link
						}
					}
					r = list(file_dict.keys())

					df_list.append([file_dict['file_type'], file_dict['file_date'], file_dict['links']['documents']])
	
		df = pd.DataFrame(df_list)
		if not df.empty:
			df.columns=['Type', 'Date', 'Links']

	else:
		df = pd.DataFrame()

	#pd.set_option('display.max_colwidth', -1)					# For dev purpuoses

	return (df)
Example #34
 def extract_date(self, element):
     date = element.css(self.DATE_CSS).re(self.DATE_REGEX)
     date = "/".join(date)
     return dateparser.parse(date, languages=["pt"]).date()
Example #35
    def parse(self):
        content = self.content
        f = StringIO(content)
        reader = DictReaderStrip(f, delimiter=',')
        transactions = []
        for row in reader:
            print("Importing {} at {}".format(row['商品说明'], row['交易时间']))
            meta = {}
            time = dateparser.parse(row['交易时间'])
            meta['alipay_trade_no'] = row['交易订单号']
            meta['trade_time'] = row['交易时间']
            meta['timestamp'] = str(time.timestamp()).replace('.0', '')
            account = get_account_by_guess(row['交易对方'], row['商品说明'], time)
            flag = "*"
            amount_string = row['金额']
            amount = float(amount_string)

            if row['商家订单号'] != '/':
                meta['shop_trade_no'] = row['商家订单号']

            meta = data.new_metadata('beancount/core/testing.beancount', 12345,
                                     meta)
            entry = Transaction(meta, date(time.year, time.month, time.day),
                                '*', row['交易对方'], row['商品说明'], data.EMPTY_SET,
                                data.EMPTY_SET, [])

            status = row['交易状态']
            trade_type = row['收/支']
            trade_account_original = row['收/付款方式']
            if trade_account_original == '余额':
                trade_account_original = '支付宝余额'
            trade_account = accounts[
                trade_account_original] if trade_account_original in accounts else AccountAssetUnknown

            if trade_type == '支出':
                if status in [
                        '交易成功', '支付成功', '代付成功', '亲情卡付款成功', '等待确认收货', '等待对方发货',
                        '交易关闭'
                ]:
                    data.create_simple_posting(entry, trade_account,
                                               '-' + amount_string, 'CNY')
                    data.create_simple_posting(entry, account, None, None)
                else:
                    print(status)
                    exit(0)
            elif trade_type == '其他':
                if (status == '退款成功'
                        or ('蚂蚁财富' in row['交易对方'] and status == '交易成功') or
                    ('红包' == trade_account_original and status == '交易成功')
                        or ('基金组合' in row['商品说明'] and status == '交易成功')
                        or ('理财赎回' in row['商品说明'] and status == '交易成功')):
                    data.create_simple_posting(entry, trade_account,
                                               amount_string, 'CNY')
                    data.create_simple_posting(entry, account, None, None)
                elif (trade_account_original == '余额宝') and status == '交易成功':
                    data.create_simple_posting(
                        entry,
                        get_income_account_by_guess(row['交易对方'], row['商品说明'],
                                                    time), '-' + amount_string,
                        'CNY')
                    data.create_simple_posting(entry, account, None, None)
                elif '转入到余利宝' in row['商品说明'] and status == '交易成功':
                    data.create_simple_posting(entry, Account余利宝,
                                               amount_string, 'CNY')
                    data.create_simple_posting(entry, account, None, None)
                elif '余利宝-转出到银行卡' in row['商品说明'] and status == '转出成功':
                    data.create_simple_posting(entry, Account余利宝,
                                               '-' + amount_string, 'CNY')
                    data.create_simple_posting(entry, account, None, None)
                elif ((status == '交易成功' and '余额宝' in row['商品说明'])
                      or status == '还款成功'):
                    data.create_simple_posting(entry, account, amount_string,
                                               'CNY')
                    data.create_simple_posting(entry, trade_account, None,
                                               None)
                elif status == '交易关闭' and trade_account_original == '':
                    #ignore it?
                    pass
                else:
                    print(row)
                    exit(0)
            elif trade_type == '收入':
                if trade_account_original == '':
                    trade_account = Account余额
                if status == '交易成功':
                    data.create_simple_posting(
                        entry,
                        get_income_account_by_guess(row['交易对方'], row['商品说明'],
                                                    time), '-' + amount_string,
                        'CNY')
                    data.create_simple_posting(entry, trade_account, None,
                                               None)
                else:
                    print(row)
                    exit(0)
            else:
                print(row)
                exit(0)

            if not self.deduplicate.find_duplicate(entry, amount,
                                                   'alipay_trade_no'):
                transactions.append(entry)

        self.deduplicate.apply_beans()
        return transactions
Example #36
html_file = sys.argv[1]

html = open(html_file).read()
soup = BeautifulSoup(html, features="html.parser")

text = soup.get_text()
text = text.replace(u"\xa0",
                    u" ")  # replace non-breaking spaces with regular spaces

patterns = [uk_pattern, wales_pattern, scotland_pattern, ni_pattern]

for pattern in patterns:
    m = re.search(pattern, text)
    if m is not None:
        groups = m.groupdict()
        date = dateparser.parse(groups["date"]).strftime("%Y-%m-%d")
        country = normalize_whitespace(groups.get("country")).replace(
            "Scottish", "Scotland")
        tests = normalize_int(groups.get("tests", float("nan")))
        positive_tests = normalize_int(groups["positive_tests"])
        negative_tests = normalize_int(
            groups.get("negative_tests", float("nan")))
        deaths = normalize_int(groups.get("deaths", float("nan")))
        if not math.isnan(tests):
            print("{},{},{},{}".format(date, country, "Tests", tests))
            # with open(
            #     "data/daily/indicators/covid-19-{}-{}-tests.csv".format(
            #         date, format_country(country)
            #     ),
            #     "w",
            # ) as f:
Example #37
def date_value_parser_fn(value):
    return dateparser.parse(value, locales=["en-GB"]).strftime("%Y-%m-%d")
Example #38
 def get_parsed_time_since_added(self):
     return dateparser.parse(self.time_since_added)
Example #39
 def last_code_update(self):
     return dateparser.parse(
         list(self._github_obj.get_commits()).pop().last_modified)
Example #40
    FILENAME_PARSE_TRANSFORMS.append((re.compile(t["pattern"]), t["repl"]))

# TODO: this should not have a prefix.
# Specify the filename format for out files
PAPERLESS_FILENAME_FORMAT = os.getenv("PAPERLESS_FILENAME_FORMAT")

THUMBNAIL_FONT_NAME = os.getenv(
    "PAPERLESS_THUMBNAIL_FONT_NAME",
    "/usr/share/fonts/liberation/LiberationSerif-Regular.ttf")

# Tika settings
PAPERLESS_TIKA_ENABLED = __get_boolean("PAPERLESS_TIKA_ENABLED", "NO")
PAPERLESS_TIKA_ENDPOINT = os.getenv("PAPERLESS_TIKA_ENDPOINT",
                                    "http://localhost:9998")
PAPERLESS_TIKA_GOTENBERG_ENDPOINT = os.getenv(
    "PAPERLESS_TIKA_GOTENBERG_ENDPOINT", "http://localhost:3000")

if PAPERLESS_TIKA_ENABLED:
    INSTALLED_APPS.append("paperless_tika.apps.PaperlessTikaConfig")

# List dates that should be ignored when trying to parse date from document text
IGNORE_DATES = set()

if os.getenv("PAPERLESS_IGNORE_DATES", ""):
    import dateparser

    for s in os.getenv("PAPERLESS_IGNORE_DATES", "").split(","):
        d = dateparser.parse(s)
        if d:
            IGNORE_DATES.add(d.date())
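Hedged illustration of the environment variable consumed above; the dates are examples.

# PAPERLESS_IGNORE_DATES="2010-01-01,15.07.2020"
# Each comma-separated token is passed through dateparser.parse, and only values that
# parse successfully are added to IGNORE_DATES (as date objects).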
Example #41
 def __init__(self, user, date, amount=0):
     self.user = user
     self.amount = amount
     self.date = dateparser.parse(date).date()
ticket_details_filename = sys.argv[1]
ticket_history_filename = sys.argv[2]

with open(ticket_details_filename, 'rb') as f:
    ticket_details = list(csv.DictReader(f))
with open(ticket_history_filename, 'rb') as f:
    ticket_history = list(csv.DictReader(f))

print("Rows in ticket_details (# tickets): {}".format(len(ticket_details)))
print("Rows in ticket_history: {}".format(len(ticket_history)))

# Reformat the date strings into datetime objects so we can do math on them
# Also reformat the logged time spent values so that we can do math on them
# Change the u'\ufeff"ticketID"' key into a proper u'ticketID' that's easier to access
for i, item in tqdm(enumerate(ticket_details)):
    ticket_details[i][u'dateCreated'] = dateparser.parse(
        item[u'dateCreated'], settings={'TIMEZONE': 'US/Central'})
    ticket_details[i][u'dateUpdated'] = dateparser.parse(
        item[u'dateUpdated'], settings={'TIMEZONE': 'US/Central'})
    ticket_details[i][u'ticketID'] = item.pop(u'\ufeff"ticketID"')
    try:
        ticket_details[i][u'hoursSpent'] = float(item[u'hoursSpent'])
    except:
        print(
            "ticket_details[i][u'hoursSpent'] = {} cannot be parsed as a float. Recording as 0."
            .format(ticket_details[i][u'hoursSpent']))
        ticket_details[i][u'hoursSpent'] = 0.0
for i, item in tqdm(enumerate(ticket_history)):
    ticket_history[i][u'Action Date'] = dateparser.parse(
        item[u'Action Date'], settings={'TIMEZONE': 'US/Central'})
    ticket_history[i][u'ticketID'] = item.pop(u'\ufeff"Ticket ID"')
    try:
Example #43
def fetch_incidents(client: Client,
                    max_results: int,
                    last_run: dict,
                    first_fetch_time: str,
                    status: str = None,
                    feedname: str = None,
                    query: str = None):
    if (status or feedname) and query:
        raise Exception(
            f'{INTEGRATION_NAME} - Search is not permitted with both query and filter parameters.'
        )

    max_results = arg_to_number(arg=max_results,
                                arg_name='max_fetch',
                                required=False) if max_results else 50

    # How much time before the first fetch to retrieve incidents
    first_fetch_time = dateparser.parse(first_fetch_time)
    first_fetch_timestamp_ms = int(
        first_fetch_time.timestamp()) if first_fetch_time else None
    last_fetch = last_run.get('last_fetch', None)
    # Handle first fetch time
    if last_fetch is None:
        last_fetch = first_fetch_timestamp_ms
    else:
        last_fetch = int(last_fetch)

    latest_created_time = last_fetch

    incidents: List[Dict[str, Any]] = []

    # multiple statuses are not supported by api. If status provided, gets the incidents for each status.
    # Otherwise will run without status.
    alerts = []
    if status:
        for current_status in argToList(status):
            res = client.get_alerts(status=current_status, feedname=feedname)
            alerts += res.get('results', [])
    else:
        res = client.get_alerts(feedname=feedname, query=query)
        alerts += res.get('results', [])

    for alert in alerts[:max_results]:
        incident_created_time = dateparser.parse(alert.get('created_time'))
        incident_created_time_ms = int(incident_created_time.timestamp()
                                       ) if incident_created_time else 0

        # to prevent duplicates, adding incidents with creation_time > last fetched incident
        if last_fetch:
            if incident_created_time_ms <= last_fetch:
                continue

        alert_id = alert.get('unique_id', '')
        alert_name = alert.get('process_name', '')
        incident_name = f'{INTEGRATION_NAME}: {alert_id} {alert_name}'
        if not alert_id or not alert_name:
            demisto.debug(f'Alert details are missing. {str(alert)}')

        incident = {
            'name': incident_name,
            'occurred': timestamp_to_datestring(incident_created_time_ms),
            'rawJSON': json.dumps(alert),
        }

        incidents.append(incident)

        # Update last run and add incident if the incident is newer than last fetch
        if incident_created_time_ms > latest_created_time:
            latest_created_time = incident_created_time_ms

    demisto.debug(
        f'Fetched {len(alerts)} alerts. Saving {len(incidents)} as incidents.')
    # Save the next_run as a dict with the last_fetch key to be stored
    next_run = {'last_fetch': latest_created_time}
    return next_run, incidents
Example #44
    def post_scrape(self, response):
        if 'Thread-' in response.url and response.url not in self.visited_threads \
          and 'Sorry but your accessing a page(s) that is no longer ' in response.text:
            self.visited_threads.append(response.url)
            try:
                self.proxy = response.meta['proxy']
                self.cookie = response.request.cookies
            except AttributeError:
                pass

            posts = response.xpath(
                '//div[@id="posts"]/table[starts-with(@id,"post_")]')
            if not self.thread_done:
                post_info = posts[0]
                self.item['thread_url'] = response.url
                self.item['thread_group'] = response.xpath(
                    '//div[@class="navigation"]/a[2]/text()').extract_first()

                self.item['author_name'] = post_info.xpath(
                    './/em/text()').extract_first()
                membership_level = len(
                    post_info.xpath('.//td[@class="post_author"]/span/img'))

                try:
                    join_date = post_info.xpath(
                        './/td[contains(@class," post_author_info")]/div/text()'
                    ).extract()[1].split('Joined: ')[1].strip()
                    self.item['author_joined_date'] = dp.parse(
                        join_date, languages=['en']).isoformat()
                except (AttributeError, TypeError):
                    self.item['author_joined_date'] = ''

                self.item['author_posts_count'] = post_info.xpath(
                    './/td[contains(@class," post_author_info")]/div/text()'
                ).extract()[0].split('Posts: ')[1]

                timestamp = post_info.xpath(
                    './/td[@class="tcat"]/div/text()').extract_first().strip()
                try:
                    self.item['thread_timestamp'] = dp.parse(
                        timestamp, languages=['en']).isoformat()
                except (TypeError, AttributeError):
                    self.item['thread_timestamp'] = ''

                thread_content = post_info.xpath(
                    './/div[starts-with(@id,"pid_")]/..//*').extract()
                (self.item['thread_media_links'], self.item['thread_general_links']) = \
                 self.extract_links(post_info, ' '.join(thread_content))

                thread_content = post_info.xpath(
                    './/div[starts-with(@id,"pid_")]/..//text()').extract()
                self.item['thread_content'] = self.replace_patterns(
                    thread_content, timestamp)

                self.item[
                    'author_membership_level'] = '4' if membership_level > 4 else str(
                        membership_level)
                self.item['author_location'] = self.item['author_age'] = ''
                self.item['scraped_date'] = dt.now().isoformat()
                self.thread_done = True
                self.replies_data = []
                self.reply_scrape(posts[1:])

            next_page = response.xpath(
                '//a[@class="pagination_next"]/@href').extract_first()
            self.all_done = next_page is None and self.thread_done
            if next_page is not None:
                try:
                    self.reply_scrape(
                        requests.get(url=next_page,
                                     cookies=self.cookie,
                                     proxies={'http': self.proxy}))
                except Exception as e:
                    print('Next Page Exception -> Exit', e)
            self.all_done = self.thread_done

            if self.all_done:
                self.replies_data = []
                self.thread_done = False
                self.item['thread_replies'] = self.replies_data
                self.item['thread_reply_no'] = len(self.replies_data)
                yield self.item
Example #45
    def reply_scrape(self, response):
        if type(response) is scrapy.selector.unified.SelectorList:
            record = response
        elif type(response) is scrapy.http.HtmlResponse or type(
                response) is requests.models.Response:
            if response.url in self.visited_threads:
                return
            else:
                record = Selector(response).xpath(
                    '//table[starts-with(@id,"post_")]')
        else:
            return
        for reply in record:
            try:
                author_info = reply.xpath('.//tr[2]')
                reply_author = author_info.xpath('.//em/text()').extract_first(
                    default='')
                reply_author_membership = len(
                    author_info.xpath('.//td[@class="post_author"]/span/img'))

                reply_content = ' '.join(
                    reply.xpath(
                        './/div[starts-with(@id,"pid_")]/..//*').extract())
                (self.replies['reply_media_links'], self.replies['reply_general_links']) = \
                 self.extract_links(reply, reply_content)

                if '<blockquote>' in reply_content:
                    reply_content = reply.xpath(
                        './/div[starts-with(@id,"pid_")]/text()').extract()
                else:
                    reply_content = reply.xpath(
                        './/div[starts-with(@id,"pid_")]/.//text()').extract()

                try:
                    reply_timestamp = reply.xpath(
                        './/td[@class="tcat"]/div/text()').extract_first(
                        ).strip()
                    self.replies['reply_timestamp'] = dp.parse(
                        reply_timestamp, languages=['en']).isoformat()
                except (AttributeError, TypeError):
                    reply_timestamp = self.replies['reply_timestamp'] = ''

                self.replies['reply_author'] = reply_author
                self.replies['reply_content'] = self.replace_patterns(
                    reply_content, reply_timestamp)
                self.replies['reply_author_membership'] = '4' \
                 if reply_author_membership > 4 else str(reply_author_membership)
            except Exception as e:
                logging.exception('Error while scraping reply: %s', e)
            finally:
                self.replies_data.append(dict(self.replies))
        if type(response) is scrapy.selector.unified.SelectorList:
            return
        next_page = Selector(response).xpath(
            '//a[@rel="next"]/@href').extract_first()

        if self.thread_done and next_page is None:
            self.all_done = True
            return

        if next_page is not None:
            try:
                self.reply_scrape(
                    requests.get(url=next_page,
                                 cookies=self.cookie,
                                 proxies={'http': self.proxy}))
            except Exception as e:
                print('Next Page Exception -> Exit', e)
                self.all_done = self.thread_done
                return
        self.all_done = True
Example #46
 def when_date_is_parsed(self, date_string, languages=None, locales=None):
     self.result = dateparser.parse(date_string, languages=languages, locales=locales)
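# A quick illustration of the languages / locales hints exercised by the step above
# (the date strings here are made-up inputs, not fixtures from the original test suite):
import dateparser

dateparser.parse('12 janvier 2015', languages=['fr'])   # -> datetime(2015, 1, 12, 0, 0)
dateparser.parse('2015-01-12', locales=['en-GB'])        # -> datetime(2015, 1, 12, 0, 0)
dateparser.parse('1 hour ago')                           # -> roughly one hour before now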
Example #47
def cycle_certs() -> List[Tuple[int, str]]:
    """
    Request all domains ordered by user\n
    for User1: domain1, domain2\n
    for User2: domain1, ...

    Registers when:\n
    - dates of certificate change
    - the cert expires in less than 7 days\n

    Saves this updates in a list that the bot can iterate over to dispatch the information

    :return: List of Tuples(chat_id, update as string)
    """
    session = sessionmaker(bind=engine)()
    # TODO: Make this query better. It is likely to fail when the user base grows too big
    users = session.query(dbm.Users).all()

    updates: List[Tuple[int, str]] = []
    logger.info(f"Starting requests for {len(users)} users")
    checked = 0
    errors = 0
    for user in users:

        statement = select(
            dbm.Domains).where(dbm.Domains.chat_id == user.chat_id)
        # extract entry objects from row objects
        domains = [e[0] for e in session.execute(statement).all()]

        for domain in domains:
            # get cert dict
            cert = req_ct.get_cert(domain.domain, domain.port)

            # What if there is no longer a cert?
            # Add a warning message and continue (removing it from the db is currently disabled below)
            if not cert:
                errors += 1
                message = utl.prep_for_md(
                    f"*ERROR!*\nCan't resolve {utl.mk_link(domain.domain, domain.port)}\n"
                    f'Please check your service _immediately_!',
                    # f'_This domain was removed from your watchlist. You can add it again after it got a new cert._',
                    ignore=['*', '_'])
                updates.append((user.chat_id, message))
                # TODO: this should not happen immediately...
                # session.query(dbm.Domains).filter(dbm.Domains.domain == domain.domain).delete()
                # session.commit()
                logger.warning(
                    f"Could not retrieve a certificate for {domain.domain}:{domain.port}"
                )
                continue

            # extract potential new dates - removing timezone information
            new_before = dateparser.parse(
                cert['notBefore']).replace(tzinfo=None)
            new_after = dateparser.parse(cert['notAfter']).replace(tzinfo=None)

            # new_before = datetime.today()
            # check whether something has changed from the expected dates
            if domain.not_before != new_before or domain.not_after != new_after:
                # print("IS NOT EQUAL")
                message = utl.prep_for_md(
                    f"The cert of {utl.mk_link(domain.domain, domain.port)} has changed:\n"
                    f"notBefore: from {domain.not_before.replace(microsecond=0)} to {new_before.replace(microsecond=0)}\n"
                    f"notAfter: {domain.not_after.replace(microsecond=0)} to {new_after.replace(microsecond=0)}"
                )

                # append update message
                updates.append((user.chat_id, message))

                # update database object
                domain.not_before = new_before
                domain.not_after = new_after

            # new_after = datetime.today() - timedelta(2)
            # check whether cert expires in less then a week
            delta = new_after - datetime.today()
            if delta < timedelta(utl.NOTIFY_BEFORE):
                print("EXPIRES!")
                message = utl.prep_for_md(
                    f'The certificate for {utl.mk_link(domain.domain, domain.port)} will expire in:\n'
                    f'*{delta.days} days*\n'
                    f'Expiry: {new_after.replace(microsecond=0)}',
                    ignore=['*'])
                updates.append((user.chat_id, message))

            # update last checked information and commit update
            domain.last_checked = datetime.today()
            session.add(domain)
            session.commit()
            checked += 1
            # sleeping a sec to not look like a ddos attack
            time.sleep(1)

    if errors:
        logger.warning(f"Finished {checked} requests with {errors} errors")
    else:
        logger.info(f"Finished {checked} daily cert requests")
    return updates
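# req_ct.get_cert is assumed to return a dict like ssl.SSLSocket.getpeercert(), whose
# notBefore / notAfter values are strings such as 'May 30 00:00:00 2024 GMT'.
# A small sketch of the expiry arithmetic used above, on a made-up value:
import dateparser
from datetime import datetime

not_after = dateparser.parse('May 30 00:00:00 2024 GMT').replace(tzinfo=None)
days_left = (not_after - datetime.today()).days   # negative once the cert has expired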
Example #48
    def parse_start_url(self, response):
        #self.log('Response for URL "{}", which has flags "{}"'.format(response.url, response.flags))
        if response.url == 'http://magic.wizards.com/en/events/coverage':
            # go through this document to create valid tournaments
            # for url in response.xpath('//a/@href').extract():
            for bloop in response.xpath('//p').extract():
                try:
                    #self.log('This is a "{}"'.format(str(bloop)))
                    pass
                except exceptions.UnicodeEncodeError:
                    pass
                p_match = re.compile(
                    '<p><(strong|b)>([^<]+)</(strong|b)>(.+)</p>$',
                    re.U).match(bloop)
                if p_match:
                    event_type_name = remove_tags(p_match.group(2))
                    stuff = p_match.group(4)
                    lines = stuff.split('<br>')
                    for line in lines:
                        line_re = re.compile(
                            r'href="([^"]+)">(.+)</a> \(([^\)]+)\)([^A-Za-z]+([A-Z].+))?',
                            re.U)
                        line_match = line_re.search(line)
                        if line_match:
                            name = remove_tags(line_match.group(2))
                            fmt = 'Not Supported'
                            try:
                                sys.stderr.write("LINE: '{}'\n".format(line))
                            except exceptions.UnicodeEncodeError:
                                sys.stderr.write(
                                    "I HATE PYTHON UNICODE SUPPORT\n")

                            if line_match.group(5) is not None:
                                for supfmt in [
                                        'Modern', 'Standard', 'Commander',
                                        'Tiny Leaders'
                                ]:
                                    if line_match.group(5).find(supfmt) > -1:
                                        fmt = supfmt
                                        break
                            if event_type_name == 'Grand Prix':
                                name = 'Grand Prix {}'.format(name)
                            if event_type_name == 'Pro Tour':
                                name = 'Pro Tour {}'.format(name)
                            dates_part = line_match.group(3)
                            if dates_part == 'December 2-3, 7, 2014':
                                dates_part = 'December 2-7, 2014'
                            clean_start_date = None
                            clean_end_date = None
                            try:
                                clean_start_date, clean_end_date = rangeparse(
                                    dates_part)
                            except pyparsing.ParseException:
                                pass
                            if clean_start_date is not None and clean_start_date.year > 2010:
                                if clean_end_date is None:
                                    clean_end_date = clean_start_date
                                url = line_match.group(1)
                                if url.find('http') < 0:
                                    url = 'http://magic.wizards.com{}'.format(
                                        url)
                                ti = TournamentItem(
                                    name=name,
                                    url=url,
                                    tournament_format=fmt,
                                    start_date=clean_start_date,
                                    end_date=clean_end_date)
                                yield ti
        else:
            # looking for decks on pages like https://magic.wizards.com/en/events/coverage/2018natus/top-8-decklists-2018-07-01
            self.log("Let's try this...")
            if len(response.selector.xpath('//div[@class="deck-group"]')) > 0:
                # this page has deck listings on it!

                ti = TournamentItem()
                # let's get the event name and URL, if we can.
                breadcrumb_tournament = response.selector.xpath(
                    '//div[@id="breadcrumb"]/span[not(@class="current")][last()]/a'
                )
                if len(breadcrumb_tournament) > 0:
                    self.log("breadcrumb_tournament = {}".format(
                        breadcrumb_tournament))
                    self.log("breadcrumb_tournament len = {}".format(
                        len(breadcrumb_tournament)))
                    ti['name'] = breadcrumb_tournament.xpath(
                        './/text()').extract()[0]
                    ti['url'] = breadcrumb_tournament.xpath(
                        './/@href').extract()[0]

                # and now try to figure out the date
                posted_in = response.selector.xpath(
                    '//p[@class="posted-in"]/text()').extract()
                for val in posted_in:
                    dre_match = DATE_RE.search(val)
                    if dre_match:
                        tdate = dateparser.parse(dre_match.group(1)).date()
                        self.log("date is = {}".format(tdate))
                        ti['start_date'] = tdate
                        ti['end_date'] = tdate
                        break

                # and now the format...
                format_sels = response.selector.xpath(
                    '//div[@id="content-detail-page-of-an-article"]/p/text()')
                ti['tournament_format'] = None
                for format_sel in format_sels:
                    if ti['tournament_format'] is None:
                        val = format_sel.extract()
                        if 'Legacy' in val:
                            ti['tournament_format'] = 'Legacy'
                        if 'Standard' in val:
                            ti['tournament_format'] = 'Standard'
                        if 'Modern' in val:
                            ti['tournament_format'] = 'Modern'

                self.log("TournamentItem is {}".format(ti))

                # BOOKMARK - so, if I think I have a valid TournamentItem, I need to yield it
                page_place = 1
                for deckgroup_selector in response.selector.xpath(
                        '//div[@class="deck-group"]'):
                    self.parse_deckgroup(response, deckgroup_selector,
                                         page_place)
                    page_place += 1
Example #49
def get_cal():
    now = datetime.now()
    print("edtScraping ", now.strftime("%d/%m/%Y %H:%M"))

    url = "http://www.ipst-info.net/consultation/default_stage.aspx?stage=aisl"

    # create a new Firefox session
    #driver = webdriver.Firefox()
    driver = webdriver.Remote(command_executor='http://selenium:4444/wd/hub',
                              desired_capabilities=DesiredCapabilities.FIREFOX)
    driver.implicitly_wait(30)
    driver.get(url)

    cours = []

    # compute the number of weeks remaining until the end of the year
    finAnnee = dateparser.parse("30 octobre 2020 17:00")
    today = datetime.now()

    monday1 = (today - timedelta(days=today.weekday()))
    monday2 = (finAnnee - timedelta(days=finAnnee.weekday()))

    numberOfWeekUntilEndOfYear = int(((monday2 - monday1).days / 7) + 1)

    print("Scrap starting ...")
    for i in range(numberOfWeekUntilEndOfYear):
        year = getYear(driver)
        week = getDayOfWeek(driver)
        cours = scrapCours(driver, cours, year, week)

        #click
        python_button = driver.find_element_by_id(
            'Planning_stage1_semaine_suivante')
        python_button.click()
        time.sleep(.300)  # give the page time to load

    driver.quit()

    cal = Calendar()
    cal.add("summary", "Calendrier Cnam I2")
    cal.add('version', '2.0')

    for cour in cours:
        dateDebut = dateparser.parse(cour.dateDebut)
        dateFin = dateparser.parse(cour.dateFin)

        if 'En entreprise' not in cour.matiere and 'nondéfini' not in cour.matiere and 'Férié' not in cour.matiere:
            event = Event()
            event.add('dtstart', dateDebut)
            event.add('dtend', dateFin)

            # to keep the 'En entreprise', 'Férié' and undefined days, use:
            # if 'En entreprise' in cour[2] or  'nondéfini' in cour[2] or 'Férié' in cour[2]:
            #     matiere = cour.matiere
            # else:
            matiere = cour.matiere[:6] + " " + cour.matiere[6:]

            event.add('summary', matiere)
            event.add(
                'description', 'Enseignant : ' + cour.enseignant +
                "\nCommentaire : " + cour.commentaire +
                "\nhttp://www.ipst-info.net/consultation/default_stage.aspx?stage=aisl"
            )
            cal.add_component(event)

    print("Scrap end")

    # save a timestamped copy of the .ics for history
    dateForIcsName = today.strftime('%Y-%m-%d_%H:%M')
    with open('/home/scraper/history/calendarCnamI2' + dateForIcsName + '.ics',
              'wb') as f:
        f.write(cal.to_ical())
        print('/home/scraper/history/calendarCnamI2' + dateForIcsName +
              '.ics Saved')

    # overwrite the previous file and save the new one
    with open('/home/scraper/last/calendarCnamI2.ics', 'wb') as f:
        f.write(cal.to_ical())
        print("/home/scraper/last/calendarCnamI2.ics Saved")

    return cal
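# dateparser detects the input language automatically, which is why the French date
# strings handled above parse without extra configuration; a minimal illustration:
import dateparser

dateparser.parse("30 octobre 2020 17:00")   # -> datetime(2020, 10, 30, 17, 0)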
Example #50
# def parse_time(str_time):
# '12:00:00'

import dateparser

f = open('sample.txt')
lines = f.readlines()
time_hash = {}
idx = 1

for line in lines:
    time_status = line.split('::')
    status = (time_status[1]).split('\n')[0]
    # print(status)
    # time = (time_status[0]).split('-')[1]
    # time = time.split(')')[0]
    # # print(time)
    # if status in time_hash:
    #     time_hash[status] -= parse_time(time)
    # else:
    #     time_hash[status] = time
    time_hash[idx] = dateparser.parse(time_status[0])
    idx += 1

print(time_hash)
Example #51
def test_module(client: IMAPClient) -> str:
    yesterday = parse('1 day UTC')
    client.search(['SINCE', yesterday])
    return 'ok'
Example #52
def stt_parse_response(stt_data):
    def truncate(n, decimals=0):
        multiplier = 10**decimals
        return int(n * multiplier) / multiplier

    parse_stt_output_response = {
        'fileid': stt_data[1],
        'dlp': stt_data[2],
        'filename': stt_data[3],
        'callid': stt_data[4],
        'date': str(dateparser.parse(stt_data[5])),
        'year': stt_data[6],
        'month': stt_data[7],
        'day': stt_data[8],
        'starttime': stt_data[9],
        'duration': None,
        'speakeronespeaking': None,
        'speakertwospeaking': None,
        'silencesecs': None,
        'silencepercentage': None,
        'nlcategory': None,
        'sentimentscore': None,
        'magnitude': None,
        'transcript': None,
        'words': [],
        'entities': [],
        'sentences': [],
    }
    string_transcript = ''
    total_speaking_time = 0
    total_speaker_one_speaking = 0
    total_speaker_two_speaking = 0
    agent_search_word = stt_data[0]
    speaker_one_tag = 0
    speaker_two_tag = 0

    # get transcript from stt_data
    for i in stt_data[0]['response']['results']:
        if 'transcript' in i['alternatives'][0]:
            string_transcript += str(i['alternatives'][0]['transcript']) + ' '
    parse_stt_output_response[
        'transcript'] = string_transcript[:-1]  # remove the ending whitespace

    # check if the audio file is stereo
    if stt_data[11] == 'true':
        logging.info('Audio file is stereo')
        speaker_one_tag = 1
        speaker_two_tag = 2

    # check if the audio file is mono
    if stt_data[11] == 'false':
        logging.info('Audio file is mono')
        speaker_one_tag = 1
        speaker_two_tag = 2

    # check if the audio file is stereo
    if stt_data[11] == 'true':
        # get words from stt_data and enrich data
        for element in stt_data[0]['response']['results']:
            for word in element['alternatives'][0]['words']:
                total_speaking_time += float(
                    word['endTime'].strip('s')) - float(
                        word['startTime'].strip('s'))
                if element['channelTag'] == speaker_one_tag:
                    total_speaker_one_speaking += float(
                        word['endTime'].strip('s')) - float(
                            word['startTime'].strip('s'))
                    parse_stt_output_response['words'].append({
                        'word':
                        word['word'],
                        'startsecs':
                        word['startTime'].strip('s'),
                        'endsecs':
                        word['endTime'].strip('s'),
                        'speakertag':
                        element['channelTag'],
                        'confidence':
                        word['confidence']
                    })
                if element['channelTag'] == speaker_two_tag:
                    total_speaker_two_speaking += float(
                        word['endTime'].strip('s')) - float(
                            word['startTime'].strip('s'))
                    parse_stt_output_response['words'].append({
                        'word':
                        word['word'],
                        'startsecs':
                        word['startTime'].strip('s'),
                        'endsecs':
                        word['endTime'].strip('s'),
                        'speakertag':
                        element['channelTag'],
                        'confidence':
                        word['confidence']
                    })
        stt_start_time = float(
            stt_data[0]['response']['results'][0]['alternatives'][0]['words']
            [0]['startTime'].strip('s'))
        stt_end_time = float(
            stt_data[0]['response']['results'][-1]['alternatives'][0]['words']
            [-1]['endTime'].strip('s'))
        parse_stt_output_response['silencesecs'] = float(
            stt_data[0]['response']['results'][-1]['alternatives'][0]['words']
            [-1]['endTime'].strip('s')) - total_speaking_time
        parse_stt_output_response['silencepercentage'] = truncate(
            parse_stt_output_response['silencesecs'] /
            float(stt_data[0]['response']['results'][-1]['alternatives'][0]
                  ['words'][-1]['endTime'].strip('s')) * 100)

    # check if the audio file is mono
    if stt_data[11] == 'false':
        # get words from stt_data and enrich data
        for element in stt_data[0]['response']['results'][-1]['alternatives'][
                0]['words']:
            total_speaking_time += float(
                element['endTime'].strip('s')) - float(
                    element['startTime'].strip('s'))
            if element['speakerTag'] == speaker_one_tag:
                total_speaker_one_speaking += float(
                    element['endTime'].strip('s')) - float(
                        element['startTime'].strip('s'))
            if element['speakerTag'] == speaker_two_tag:
                total_speaker_two_speaking += float(
                    element['endTime'].strip('s')) - float(
                        element['startTime'].strip('s'))
            parse_stt_output_response['words'].append({
                'word':
                element['word'],
                'startsecs':
                element['startTime'].strip('s'),
                'endsecs':
                element['endTime'].strip('s'),
                'speakertag':
                element['speakerTag'],
                'confidence':
                element['confidence']
            })
        stt_start_time = float(
            stt_data[0]['response']['results'][-1]['alternatives'][0]['words']
            [0]['startTime'].strip('s'))
        stt_end_time = float(
            stt_data[0]['response']['results'][-1]['alternatives'][0]['words']
            [-1]['endTime'].strip('s'))
        parse_stt_output_response['silencesecs'] = float(
            stt_data[0]['response']['results'][-1]['alternatives'][0]['words']
            [-1]['endTime'].strip('s')) - total_speaking_time
        parse_stt_output_response['silencepercentage'] = truncate(
            parse_stt_output_response['silencesecs'] /
            float(stt_data[0]['response']['results'][-1]['alternatives'][0]
                  ['words'][-1]['endTime'].strip('s')) * 100)

    parse_stt_output_response[
        'speakeronespeaking'] = total_speaker_one_speaking
    parse_stt_output_response[
        'speakertwospeaking'] = total_speaker_two_speaking
    parse_stt_output_response['duration'] = stt_start_time + stt_end_time

    # place holder for Google AutoML NLP
    parse_stt_output_response['nlcategory'] = 'NA'

    return parse_stt_output_response
Example #53
def format_date(userdate):
    date = dateparser.parse(userdate)
    try:
        return datetime.datetime.strftime(date, "%Y-%m-%d")
    except TypeError:
        return None
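# Illustrative calls for format_date (made-up inputs, not from the original project):
#
#     format_date('March 5, 2021')   # -> '2021-03-05'
#     format_date('not a date')      # -> None (dateparser.parse returns None,
#                                    #    so strftime raises TypeError)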
Example #54
def process_game_page(url, season = "2016-2017", season_range=[8,7]):
    years = season.split("-")
    years = [int(year.strip()) for year in years]
    
    match = {}
    
    r2 = requests.get(url)
    soup = BeautifulSoup(r2.text, 'html.parser')
    body = soup.find("body")
    matchhead = body.find("div", {"class": "match-head"})
    
    details = matchhead.find_all("li")
    date_temp = dateparser.parse(f"{details[1].text}")
    if date_temp.month < season_range[0]:
        # Must be in the second year of the season
        year=years[1]
    else:
        year=years[0]
    # Need to manually modify the date as there's no year in the string
    match['date'] = dateparser.parse(f"{details[1].text} {year}")
    
    match['stadium'] = details[2].text.split("\n")[0].strip()    
    match['home'] = {}
    match['away'] = {}
    teamdetails = matchhead.find("div", {"class":"match-head__fixture"}).find_all("div", {"class": "match-head__fixture-side"})
    match['home']['team'] = teamdetails[0].find("a", {"class":"match-head__team-name"}).find("span", {"class": "swap-text__target"}).text
    try:
        match['home']['score'] = int(teamdetails[0].find("span", {"class": "match-head__score"}).text)
        match['home']['scores'] = process_team_details(teamdetails[0])
        match['away']['score'] = int(teamdetails[1].find("span", {"class": "match-head__score"}).text)
        match['away']['scores'] = process_team_details(teamdetails[1])
    except Exception:
        pass
    
    match['away']['team'] = teamdetails[1].find("a", {"class":"match-head__team-name"}).find("span", {"class": "swap-text__target"}).text
    
    

    # Parse the line-ups
    try:
        teamslineup = body.find_all("ul", {"class": "team-lineups__list-group"})
        if len(teamslineup)<4:
            return match
        lineups = {}
        for i, lineup in enumerate(['home', 'away']):
            lineups[lineup] = {}
            players = teamslineup[2*i].find_all("li")
            for player in players:
                try:
                    number = int(player.find("span", {"class": "team-lineups__list-player-number"}).text.strip())

                    name = player.find("span", {"class": "team-lineups__list-player-name"}).text.strip()


                    offs = []
                    ons = []
                    yellows = []
                    reds = []
                    events = player.find_all("span", {"class": "team-lineups__list-events"})
                    for event in events:
                        img = event.find("img")
                        if img:
                            if (img.get("src").split("/")[-1]) == "substitution_off.svg":
                                offs.append(int(event.text.strip().split("'")[0]))
                            elif (img.get("src").split("/")[-1]) == "substitution_on.svg":
                                ons.append(int(event.text.strip().split("'")[0]))
                            elif (img.get("src").split("/")[-1]) == "yellow_card.svg":
                                yellows.append(int(event.text.strip().split("'")[0]))
                            elif (img.get("src").split("/")[-1]) == "red_card.svg":
                                reds.append(int(event.text.strip().split("'")[0]))
                            else:
                                print("Unknown event type: {}".format(img.get("src").split("/")[-1]))
                    if number <= 15:
                        ons.append(0)
                    lineups[lineup][number] = {"name":name, "on": ons, "off": offs, "reds": reds, "yellows": yellows}
                except Exception:
                    continue
        match['home']['lineup'] = lineups['home']
        match['away']['lineup'] = lineups['away']
    except Exception:
        pass
    return match
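# Why the year is appended manually above: when the scraped string carries no year,
# dateparser falls back to the current year, which would mis-file matches from past
# seasons. Illustration with a made-up fixture string:
import dateparser

dateparser.parse("Saturday 3 September")         # year defaults to the current year
dateparser.parse("Saturday 3 September 2016")    # -> datetime(2016, 9, 3, 0, 0)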
Example #55
 def __init__(self, title, content, date, kind=None):
     self.title = title
     self.content = content
     self.date = dateparser.parse(date).date()
     self.kind = kind
Example #56
    def parse(self, response):
        review = ReviewItem()
        product = ProductItem()
        product_id = ProductIdItem()
        contents = response.xpath("//div[@class='yt-lockup-content']")
        pic_contents = response.xpath("//div[@class='yt-lockup-dismissable']")
        for content, pic_content in zip(contents, pic_contents):
            # print response.url
            test_url = self.extract(content.xpath(".//a/@href"))
            full_url = get_full_url(response.url, test_url)
            sid = full_url.split('=')[1]
            title = self.extract(content.xpath(".//a/@title"))
            summary = self.extract_all(
                content.xpath(
                    ".//div[@class='yt-lockup-description yt-ui-ellipsis yt-ui-ellipsis-2']//text()"
                ))
            date_str = self.extract(
                content.xpath(
                    ".//ul[@class='yt-lockup-meta-info']/li[1]/text()"))
            date_time = dateparser.parse(date_str)
            review_date = datetime.strftime(date_time, "%Y-%m-%d")
            author = self.extract(
                content.xpath(".//div[@class='yt-lockup-byline']//a//text()"))
            if not author:
                self.go_to_review_page(test_url)
                author = self.get_author(response)
            if not author:
                author = self.extract(
                    content.xpath('//meta[@name="title"]/@content'))
            pic_url = 'https://i.ytimg.com/vi/{}/default.jpg'.format(sid)
            duration_str = self.extract_all(
                pic_content.xpath('.//span[@class="video-time"]//text()'))
            duration = self.calculate_duration(duration_str)
            # product items
            product['source_internal_id'] = sid
            product['ProductName'] = self.get_product_name(title)
            product['TestUrl'] = full_url
            product['PicURL'] = pic_url
            # review items
            review['source_internal_id'] = sid
            review['TestUrl'] = full_url
            review['ProductName'] = self.get_product_name(title)
            review['TestSummary'] = summary
            review['DBaseCategoryName'] = 'vpro'
            review['Author'] = author
            review['TestDateText'] = review_date
            review['TestTitle'] = title
            # we have different product_id items, for youtube_id:
            product_id['ProductName'] = self.get_product_name(title)
            product_id['source_internal_id'] = sid
            product_id['ID_kind'] = 'youtube_id'
            product_id['ID_value_orig'] = sid
            product_id['ID_value'] = sid
            yield product_id
            # for duration
            product_id['ProductName'] = self.get_product_name(title)
            product_id['source_internal_id'] = sid
            product_id['ID_kind'] = 'video_duration'
            product_id['ID_value'] = duration
            yield product_id

            yield review
            yield product
Example #57
 def pre_load(self, data):
     data["source"] = s.BINANCE
     data["openTime"] = dateparser.parse(str(data["openTime"])).isoformat()
     data["closeTime"] = dateparser.parse(str(
         data["closeTime"])).isoformat()
     return data
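# openTime / closeTime are assumed to be epoch-millisecond integers (as in Binance
# kline payloads); dateparser also accepts such timestamps once stringified, which is
# why the str() wrapping above works (exact timezone handling depends on settings):
import dateparser

dateparser.parse(str(1609459200000))   # parsed as a Unix timestamp in milliseconds (2021-01-01 UTC)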
Example #58
    def on_get(self, req, resp):
        # current time
        now = dateparser.parse("now UTC")
        # compute time to each payout
        data = []
        for po in SHARD_DATA:
            payout_time = dateparser.parse(f'today {po["payout"]} UTC')
            delta = payout_time - now
            # Copy the payout data
            row = po.copy()
            row['time_to_payout'] = delta.seconds  # time to payout, in seconds
            data.append(row)
        # sort and format for output
        fields = []
        for po in sorted(data, key=lambda x: x['time_to_payout']):
            hours = po['time_to_payout'] // 3600  # hours to payout
            minutes = (po['time_to_payout'] % 3600) // 60  # minutes to payout
            fields.append({
                'name': f'{hours:02}:{minutes:02} (UTC {po["payout"]})',
                'value': f'{po["emoji"]} [{po["name"]}]({po["swgoh.gg"]})',
                'inline': True
            })
        embed = {
            'description': '**Time until next payout**:',
            'footer': {
                'text': 'Last refresh:',
                'icon_url': 'https://i.imgur.com/OEwutbb.png',
            },
            'thumbnail': {
                'url': 'https://i.imgur.com/OEwutbb.png',
            },
            'timestamp': now.strftime('%Y-%m-%d %H:%M:%S')
        }

        with DiscordAPISession(DISCORD_BASE_URL) as api:
            # First, check for existing messages and delete them
            headers = {
                'Authorization': f'Bot {DISCORD_BOT_TOKEN}',
                'Content-Type': 'application/json'
            }
            messages_resp = api.get(f'channels/{channel_id}/messages',
                                    headers=headers)
            if messages_resp.status_code != 200:
                self.logger.debug(messages_resp.raw)
                resp.status = falcon.HTTP_500
                resp.body = json.dumps(
                    {"error": "Could not get channel messages"})
                return
            messages = messages_resp.json()
            if (msg_count := len(messages)) > 0:
                # First, clear the channel
                if msg_count == 1:
                    # The bulk delete API has a minimum # of message ID's,
                    # so we use a different API to delete just a single message
                    msgid = messages[0]['id']
                    del_resp = api.delete(
                        f'channels/{channel_id}/messages/{msgid}',
                        headers=headers)
                    if del_resp.status_code != 204:
                        self.logger.debug("Error deleting message")
                        self.logger.debug(del_resp.json())
                else:
                    # Use the bulk delete API (max 100 messages per call)
                    while (batch := [msg['id'] for msg in messages[:100]]):
                        self.logger.debug(batch)
                        del_resp = api.post(
                            f'channels/{channel_id}/messages/bulk-delete',
                            headers=headers,
                            json={'messages': batch})
                        if del_resp.status_code != 204:
                            self.logger.debug("Error deleting message")
                            self.logger.debug(del_resp.json())
                        messages = messages[100:]
Example #59
    def get_date(self):
        """
        find references to relative or absolute dates in the text
        also remove the date references from self.tokens
        returns 2d list of start_date, end_date
        """

        today = self.today
        today_weekday = self.today_weekday

        tokens = self.tokens

        parse_past = {'PREFER_DATES_FROM': 'past', 'TIMEZONE': 'US/Eastern'}
        parse_future = {
            'PREFER_DATES_FROM': 'future',
            'TIMEZONE': 'US/Eastern'
        }

        # convert tokens to [[parsed_date, token], ...]
        # the parsed dates are used for the search date range; the matching tokens
        # are later removed from the search terms
        date_tokens = [[dateparser.parse(t, settings=parse_future), t]
                       for t in tokens]
        # remove tokens with no mention of dates
        date_tokens = [d for d in date_tokens if d[0]]

        # check edge cases
        if not date_tokens:
            if 'tonight' in tokens:
                # TODO: add filter for time
                search_date = dateparser.parse('today', settings=parse_past)
                date_tokens = [[search_date, 'tonight']]

            if 'this weekend' in self.bigrams:
                # if you're asking about the weekend while it is the weekend
                if today_weekday in ['friday', 'saturday']:
                    start_date = today

                else:
                    # if it's not yet the weekend
                    start_date = dateparser.parse('friday',
                                                  settings=parse_future)

                end_date = dateparser.parse('sunday', settings=parse_future)
                date_tokens = [[start_date, 'this weekend'], [end_date, '']]

            if 'this week' in self.bigrams:
                start_date = dateparser.parse('today', settings=parse_future)
                end_date = dateparser.parse('saturday', settings=parse_future)
                date_tokens = [[start_date, 'this week'], [end_date, '']]

            # TODO:
            # if 'this month' in self.bigrams:
            #     start_date = dateparser.parse('today', settings=parse_future)
            #     last_day_of_month =
            #     date_tokens = [[start_date, 'this month'], [last_day_of_month, '']]

        search_dates = [d[0] for d in date_tokens]
        date_strings = [d[1] for d in date_tokens]

        self.remove_date_refs_from_tokens(date_strings)

        # set end date time to end of the day
        if len(search_dates) > 1:
            search_dates[1] = search_dates[1] + datetime.timedelta(hours=23,
                                                                   minutes=55)

        return search_dates
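# The PREFER_DATES_FROM setting is what lets the same word resolve to different days
# depending on intent (results shown are relative to the current date):
import dateparser

dateparser.parse('friday', settings={'PREFER_DATES_FROM': 'future'})   # the coming Friday
dateparser.parse('friday', settings={'PREFER_DATES_FROM': 'past'})     # the most recent Friday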
Example #60
def find_fixture(
    team,
    was_home=None,
    other_team=None,
    gameweek=None,
    season=CURRENT_SEASON,
    kickoff_time=None,
    dbsession=session,
):
    """Get a fixture given a team and optionally whether the team was at home or away,
    the season, kickoff time and the other team in the fixture. Only returns the fixture
    if exactly one is found that matches the input arguments, otherwise raises a
    ValueError.
    """
    fixture = None

    if not isinstance(team, str):
        team_name = get_team_name(team, season=season, dbsession=dbsession)
    else:
        team_name = team

    if not team_name:
        raise ValueError("No team with id {} in {} season".format(
            team, season))

    if other_team and not isinstance(other_team, str):
        other_team_name = get_team_name(other_team,
                                        season=season,
                                        dbsession=dbsession)
    else:
        other_team_name = other_team

    query = dbsession.query(Fixture).filter_by(season=season)
    if gameweek:
        query = query.filter_by(gameweek=gameweek)
    if was_home is True:
        query = query.filter_by(home_team=team_name)
    elif was_home is False:
        query = query.filter_by(away_team=team_name)
    elif was_home is None:
        query = query.filter(
            or_(Fixture.away_team == team_name,
                Fixture.home_team == team_name))
    else:
        raise ValueError("was_home must be True, False or None")

    if other_team_name:
        if was_home is True:
            query = query.filter_by(away_team=other_team_name)
        elif was_home is False:
            query = query.filter_by(home_team=other_team_name)
        elif was_home is None:
            query = query.filter(
                or_(
                    Fixture.away_team == other_team_name,
                    Fixture.home_team == other_team_name,
                ))

    fixtures = query.all()

    if not fixtures:
        raise ValueError(
            ("No fixture with season={}, gw={}, team_name={}, was_home={}, "
             "other_team_name={}, kickoff_time={}").format(
                 season, gameweek, team_name, was_home, other_team_name,
                 kickoff_time))

    if len(fixtures) == 1:
        fixture = fixtures[0]
    elif kickoff_time:
        # team played multiple games in the gameweek, determine the
        # fixture of interest using the kickoff time,
        kickoff_date = dateparser.parse(kickoff_time)
        kickoff_date = kickoff_date.replace(tzinfo=timezone.utc)
        kickoff_date = kickoff_date.date()

        for f in fixtures:
            f_date = dateparser.parse(f.date)
            f_date = f_date.replace(tzinfo=timezone.utc)
            f_date = f_date.date()
            if f_date == kickoff_date:
                fixture = f
                break

    if not fixture:
        raise ValueError((
            "No unique fixture with season={}, gw={}, team_name={}, was_home={}, "
            "kickoff_time={}").format(season, gameweek, team_name, was_home,
                                      kickoff_time))

    return fixture