Пример #1
0
 def search_parse(self, shortname, text, settings):
     """Search *text* for date substrings and parse each one.

     Returns a list of ``(substring, datetime)`` tuples.
     """
     translated, original = self.search(shortname, text, settings)
     # These languages split tokens on spaces while their dictionaries
     # contain multi-word entries, so their translated form is unreliable
     # and the original text is parsed in the source language instead.
     if shortname in ('vi', 'hu'):
         languages, to_parse = [shortname], original
     else:
         languages, to_parse = ['en'], translated
     parser = DateDataParser(languages=languages, settings=settings)
     parsed, substrings = self.parse_found_objects(parser=parser, to_parse=to_parse,
                                                   original=original, translated=translated, settings=settings)
     # Reset the parser's settings so subsequent uses start from defaults.
     parser._settings = Settings()
     dates = [entry['date_obj'] for entry in parsed]
     return list(zip(substrings, dates))
    def given_parser(self, settings=None):
        """Install patches so freshness parsing is observable and time-frozen.

        Wraps ``freshness_date_parser.get_date_data`` to record its result
        in ``self.freshness_result``, pins 'now'/utcnow to ``self.now``,
        and builds ``self.parser`` with the given *settings*.
        """
        def collecting_get_date_data(get_date_data):
            # Decorator that stores each call's result for later assertions.
            @wraps(get_date_data)
            def wrapped(*args, **kwargs):
                self.freshness_result = get_date_data(*args, **kwargs)
                return self.freshness_result

            return wrapped

        self.add_patch(
            patch.object(
                freshness_date_parser, 'get_date_data',
                collecting_get_date_data(freshness_date_parser.get_date_data)))

        # Spy on the real freshness parser while forcing its clock.
        self.freshness_parser = Mock(wraps=freshness_date_parser)
        self.add_patch(patch.object(self.freshness_parser, 'now', self.now))

        # Freeze datetime.utcnow inside the freshness module as well.
        dt_mock = Mock(wraps=dateparser.freshness_date_parser.datetime)
        dt_mock.utcnow = Mock(return_value=self.now)
        self.add_patch(
            patch('dateparser.freshness_date_parser.datetime', new=dt_mock))
        self.add_patch(
            patch('dateparser.date.freshness_date_parser',
                  new=self.freshness_parser))
        self.parser = DateDataParser(settings=settings)
Пример #3
0
 def __init__(self, config=None):
     """Initialize reminder state and an English, future-biased date parser."""
     self.active_reminder = {}
     # Captures the bracketed part of a message, e.g. "[tomorrow 5pm]".
     self.regex = r'\[(.*)\]'
     self.settings = dict(PREFER_DATES_FROM='future', DATE_ORDER='DMY')
     self.parser = DateDataParser(
         languages=['en'],
         allow_redetect_language=False,
         settings=self.settings)
Пример #4
0
 def parse_url(self, response):
     """Yield article URLs from the page, stamped with the page's release date."""
     raw_date = response.xpath("//span[@class='highwire-cite-metadata-date highwire-cite-metadata']//text()").extract_first()
     parsed = DateDataParser().get_date_data(raw_date)['date_obj']
     release_date = parsed.strftime("%Y-%m-%d")
     for link in response.xpath("//a[@class='highwire-cite-linked-title']"):
         href = link.xpath("./@href").extract_first()
         yield {
             "url": urlparse.urljoin(response.url, href),
             "release_date": release_date
         }
Пример #5
0
 def __call__(self, values):
     """Extract dates from *values*, formatting each with ``self.format``.

     Structured items (dict/list) are passed through unchanged;
     unparseable strings are silently dropped.
     """
     values = super(Date, self).__call__(values)
     dates = []
     for text in values:
         if isinstance(text, (dict, list)):
             dates.append(text)
             # Bug fix: without this `continue` the structured value also
             # fell through to get_date_data(), which raises an uncaught
             # TypeError for non-string input.
             continue
         try:
             date = DateDataParser().get_date_data(text)['date_obj']
             dates.append(date.strftime(self.format))
         except ValueError:
             pass
     return dates
Пример #6
0
    def given_parser(self):
        """Patch date_parser.parse to record results and spy on its calls.

        Stores each parse result in ``self.date_result`` and installs
        ``self.date_parser`` (a wrapping Mock) into ``dateparser.date``.
        """
        def collecting_get_date_data(parse):
            # Decorator that captures the parse result for later assertions.
            @wraps(parse)
            def wrapped(date_string):
                self.date_result = parse(date_string)
                return self.date_result
            return wrapped
        self.add_patch(patch.object(date_parser,
                                    'parse',
                                    collecting_get_date_data(date_parser.parse)))

        # Spy on the real parser so call counts/args can be asserted.
        self.date_parser = Mock(wraps=date_parser)
        self.add_patch(patch('dateparser.date.date_parser', new=self.date_parser))
        self.parser = DateDataParser()
Пример #7
0
 def __call__(self, values):
     """Extract dates from *values*, formatting each with ``self.format``.

     Structured items (dict/list) are passed through unchanged; strings
     that cannot be parsed or formatted are silently dropped.
     """
     values = super(Date, self).__call__(values)
     dates = []
     for text in values:
         if isinstance(text, (dict, list)):
             dates.append(text)
             # Bug fix: without this `continue` the structured value also
             # fell through to get_date_data(), which raises an uncaught
             # TypeError for non-string input.
             continue
         try:
             # Ambiguous day-of-month resolves to the 1st.
             date = DateDataParser(settings={
                 'PREFER_DAY_OF_MONTH': 'first'
             }).get_date_data(text)['date_obj']
             dates.append(date.strftime(self.format))
         except ValueError:
             pass
         except AttributeError:
             # date_obj is None when parsing fails; .strftime raises here.
             pass
     return dates
    def given_parser(self):
        """Patch the freshness parser: freeze 'now' and record results.

        Each ``get_date_data`` result is stored in
        ``self.freshness_result``; ``self.freshness_parser`` (a wrapping
        Mock) is installed into ``dateparser.date``.
        """
        self.add_patch(patch.object(freshness_date_parser, 'now', self.now))

        def collecting_get_date_data(get_date_data):
            # Decorator that captures each result for later assertions.
            @wraps(get_date_data)
            def wrapped(date_string):
                self.freshness_result = get_date_data(date_string)
                return self.freshness_result
            return wrapped
        self.add_patch(patch.object(freshness_date_parser,
                                    'get_date_data',
                                    collecting_get_date_data(freshness_date_parser.get_date_data)))

        # Spy on the real freshness parser so calls can be asserted.
        self.freshness_parser = Mock(wraps=freshness_date_parser)
        self.add_patch(patch('dateparser.date.freshness_date_parser', new=self.freshness_parser))
        self.parser = DateDataParser()
 def __init__(self, config):
     """Read timeliness options from *config* and build the date parser.

     Raises:
         ValueError: when no 'timeliness_strategy' values are configured.
     """
     super(RelevancePeriodExtractor, self).__init__(config)
     params = self.config['timeliness']
     self.extract_period = params.get('extract_period', False)
     self.timeliness_strategy = params.get('timeliness_strategy', [])
     self.date_order = params.get('date_order', 'DMY')
     self.max_empty_relevance_period = params.get('max_empty_relevance_period', 10)
     if not self.timeliness_strategy:
         raise ValueError('You need to provide values for "timeliness_strategy."')
     datapackage_check = DataPackageChecker(self.config)
     datapackage_check.check_database_completeness([self.source_file])
     # Naive datetimes, end-of-month bias, past dates, 'to' skipped in ranges.
     self.date_parser = DateDataParser(
         allow_redetect_language=True,
         settings={'RETURN_AS_TIMEZONE_AWARE': False,
                   'PREFER_DAY_OF_MONTH': 'last',
                   'PREFER_DATES_FROM': 'past',
                   'SKIP_TOKENS': ['to'],
                   'DATE_ORDER': self.date_order})
def parse_html(html):
    """Parse data from string containing HTML.

    Returns a DataFrame with columns data_adi, data_detay,
    original_week_period, original_date_text and date.
    """

    soup = BeautifulSoup(html, 'html.parser')
    df = find_data(soup)

    # append original date column
    original_week_period = find_week_date(soup)
    df['original_week_period'] = original_week_period

    # Extract the date value from the text.  Fix: raw string for the
    # pattern -- '\d' in a plain string is an invalid escape sequence
    # and warns on modern Python versions.
    date_search = re.search(r'(\d+\s+\w+\s+\d{2,4})$', original_week_period,
                            re.IGNORECASE)
    if date_search:
        original_date_text = date_search.group(1)
        df['original_date_text'] = original_date_text
    else:
        raise Exception(
            f"Couldn't extract date from date text {original_week_period}.")

    # parse date (Turkish month names, day-month-year order)
    ddp = DateDataParser(languages=['tr'], settings={'DATE_ORDER': 'DMY'})
    df['date'] = ddp.get_date_data(original_date_text).date_obj

    # parse numeric 'vaka sayısı' figure using TR locale
    # NOTE: setlocale mutates process-wide state and requires the tr_TR
    # locale to be installed on the host.
    locale.setlocale(locale.LC_NUMERIC, 'tr_TR')
    df['data-detay'] = df['data-detay'].apply(locale.atof)

    # remove dash from column names
    df = df.rename(columns={
        'data-adi': 'data_adi',
        'data-detay': 'data_detay'
    })

    return df[[
        'data_adi', 'data_detay', 'original_week_period', 'original_date_text',
        'date'
    ]]
    def _get_date_delta(self, date_val: str):
        """Derive a deterministic day offset from the FPE tweak and parse
        *date_val* into a date.

        Returns:
            tuple: ``(days, date)`` where ``days`` is
            ``int(decoded_tweak) % self.range + self.lower_range_days`` and
            ``date`` comes from ``self.format`` when it matches, otherwise
            from strict dateparser parsing.
        """
        field_ref = self._get_field_ref("tweak")
        if field_ref:
            # Use the configured tweak field, normalized to 16 digits.
            tweak, _ = fpe_base.cleanup_value(field_ref.value, field_ref.radix)
            tweak = str(tweak).zfill(16)
            tweak_val = self._fpe_ff1.encrypt(tweak.encode(), field_ref.radix)
        else:
            # No tweak field configured: encrypt a fixed all-zero tweak.
            tweak = "0000000000000000"
            tweak_val = self._fpe_ff1.encrypt(tweak.encode())

        tweak_val = self._fpe_ff1.decode(tweak_val)
        days = int(tweak_val) % self.range + self.lower_range_days
        _date_val = None
        if self.format:
            try:
                _date_val = datetime.strptime(date_val, self.format).date()
            except ValueError:
                # Format didn't match; fall through to dateparser below.
                pass
        if not _date_val:
            # STRICT_PARSING makes dateparser reject incomplete dates.
            _date_val = DateDataParser(settings={
                "STRICT_PARSING": True
            }).get_date_data(date_val)
            _date_val = _date_val["date_obj"].date()
        return days, _date_val
Пример #12
0
 def __init__(self, languages=None):
     """Build a DateDataParser for the given languages.

     Args:
         languages: optional list of language codes; defaults to ['en'].
             Fix: the default is now None instead of a shared mutable
             list (mutable-default-argument pitfall).
     """
     self.parser = DateDataParser(languages=languages if languages is not None else ['en'])
Пример #13
0
class VeritasScraper(scrapy.Spider):
    """Spider that logs into Veritas Prep and scrapes GMAT question-bank
    practice-session results as PracticeSession items."""
    name = 'veritas'
    allowed_domains = ['veritasprep.com', 'gmat.veritasprep.com']
    start_urls = ['https://www.veritasprep.com/login/']
    # Shared parser, used for relative dates like "3 hours ago".
    ddp = DateDataParser()

    def __init__(self, username, password, **kwargs):
        """Store the login credentials for the form submission."""
        super(VeritasScraper, self).__init__(**kwargs)
        self.username = username
        self.password = password

    def parse(self, response):
        """Submit the login form (the second form on the page)."""
        return [
            FormRequest.from_response(response,
                                      formdata={
                                          'username': self.username,
                                          'password': self.password
                                      },
                                      formnumber=1,
                                      callback=self.after_login)
        ]

    def after_login(self, response):
        # check login succeed before going on
        # NOTE(review): on Python 3 response.body is bytes, so this str
        # membership test raises TypeError -- confirm the target Python
        # version (response.text works on both).
        if "Your username or password does not exist." in response.body:
            self.log("Login failed")
            return

        return Request(
            url="http://gmat.veritasprep.com/question-bank/practices",
            callback=self.check_paging)

    def check_paging(self, response):
        # if there's a pager on the page, yield requests for each page, or just yield the first page if not
        pager_links = response.xpath(
            """//*[@id="primary"]/div[1]/div/ul/li/a[contains(@href, "page=")]"""
        )

        if len(pager_links) > 0:
            # Collect the distinct page numbers from every pager href.
            pages = [
                int(pnum) for pnum in set(
                    reduce(lambda x, y: x + y, [
                        re.findall(r"\?page=([0-9]+)", x.root.attrib['href'])
                        for x in pager_links if 'href' in x.root.attrib
                    ]))
            ]

            for page in pages:
                self.logger.info("about to scrape page %d..." % page)
                yield Request(url=(
                    "http://gmat.veritasprep.com/question-bank/practices?page=%d"
                    % page),
                              callback=self.parse_practices)
        else:
            # there will always be a page 1
            yield Request(url=(
                "http://gmat.veritasprep.com/question-bank/practices?page=%d" %
                1),
                          callback=self.parse_practices)

    def parse_practices(self, response):
        """Yield one PracticeSession item per finished practice row."""
        # body > div.container > div.page-body > table > tbody
        # practices = response.xpath('/html/body/div[2]/div[3]/table/tbody/tr')
        practices = response.xpath('//*[@id="primary"]/table/tbody/tr')

        for row in practices:
            # Non-empty cell texts: [date, question count, score, duration].
            cells = [
                x.strip() for x in row.css('td::text').extract()
                if x.strip() != ''
            ]
            self.log("Cells: %s" % str(cells))

            if 'Not finished' in cells[2]:
                continue

            r = PracticeSession()
            r['student'] = self.username

            # attempt to see if the date in parentheses is more specific
            # than the month-day specifier (e.g. 'hours ago'), and use it if so.
            # otherwise, just use the month-day specifier
            try:
                inner_date = row.css('td:first-child small::text').extract()[0]
                inner_date_parsed = VeritasScraper.ddp.get_date_data(
                    inner_date)
                r['taken_on'] = inner_date_parsed['date_obj'] \
                    if inner_date_parsed and ("hour" in inner_date or "minute" in inner_date) \
                    else parse_datetime(cells[0])
            except IndexError:
                # No <small> date present in the first cell.
                r['taken_on'] = parse_datetime(cells[0])

            r['question_count'] = int(cells[1])
            r['percent_correct'] = cells[2]
            r['duration'] = cells[3]

            try:
                r['site_practice_id'] = int(
                    row.xpath("*/a/@href[contains(., 'practices')]").extract()
                    [0].split('/')[-1])
            except:
                # i know it's too broad, but i just want to ignore it if i can't extract it
                r['site_practice_id'] = None

            yield r
Пример #14
0
 def _parse_date(self, string):
     """Parse *string* into a datetime.date.

     Raises:
         RuntimeError: when dateparser cannot interpret the string.
     """
     parsed = DateDataParser().get_date_data(string)['date_obj']
     if parsed is None:
         raise RuntimeError('Unable to parse date: {!r}'.format(string))
     return parsed.date()
Пример #15
0
from dateparser.date import DateDataParser

# Module-level parser shared by every get_date() call.
ddp = DateDataParser()


def get_date(date_string):
    """Return the datetime parsed from *date_string*, or None if unparseable."""
    result = ddp.get_date_data(date_string)

    return result.get('date_obj')
Пример #16
0
 def adapt(self, text, htmlpage=None):
     """Best-effort conversion of *text* to a datetime; None on failure."""
     try:
         date_obj = DateDataParser().get_date_data(text)['date_obj']
     except ValueError:
         return None
     return date_obj
Пример #17
0
 def when_date_is_parsed_using_with_datedataparser(self, dt_string):
     """Parse *dt_string* with a detect_languages-backed parser and keep the result."""
     parser = DateDataParser(detect_languages_function=detect_languages)
     date_data = parser.get_date_data(dt_string)
     self.result = date_data["date_obj"]
Пример #18
0
 def __init__(self, host, user, password):
     """Open a basic-auth JIRA connection and a Portuguese/English date parser."""
     self.jira = JIRA(host, basic_auth=(user, password), max_retries=1)
     self.ddp = DateDataParser(languages=['pt', 'en'])
Пример #19
0
    def annotate(self, doc):
        """Annotate *doc* with parsed date and date-range spans.

        Returns a dict with a 'dates' tier of DateSpans carrying
        [start, end) datetime ranges and a 'dates.all' tier that also
        keeps unparsable and non-specific date spans.
        """
        # If no date is associated with the document, the document's date will
        # be treated as the most recent date explicitly mentioned in the
        # the document.
        detect_date = doc.date is None
        doc_date = doc.date or datetime.datetime.now()
        strict_parser = DateDataParser(['en'],
                                       settings={'STRICT_PARSING': True})

        def date_to_datetime_range(text,
                                   relative_base=None,
                                   prefer_dates_from='past'):
            """Parse *text* into a [start, end) datetime pair, or None."""
            if relative_base is None:
                relative_base = doc_date
            # Handle relative date ranges like "the past ___ days"
            relative_num_days = re.sub(relative_duration_range_re, "", text)
            if len(relative_num_days) < len(text):
                num_days_datetime_range = date_to_datetime_range(
                    relative_num_days)
                if not num_days_datetime_range:
                    return None
                return [num_days_datetime_range[0], relative_base]
            text = clean_date_str(text)
            if len(text) < 3:
                return None
            # Handle ordinal dates like "the second month of 2006"
            match = ordinal_date_re.match(text)
            if match:
                match_dict = match.groupdict()
                if match_dict['ordinal']:
                    ordinal_number = ORDINALS.index(match_dict['ordinal']) + 1
                else:
                    ordinal_number = int(match_dict['ordinal_number'])
                unit = match_dict['unit']
                rest = match_dict['rest']
                if unit == 'day':
                    return date_to_datetime_range(
                        str(ordinal_number) + " " + rest)
                elif unit == 'week':
                    if ordinal_number > 4:
                        return
                    parsed_remainder = date_to_datetime_range("1 " + rest)
                    if not parsed_remainder:
                        return
                    week_start = parsed_remainder[0]
                    week_start = date_to_datetime_range(
                        "Sunday",
                        # A day is added because if the base date is on Sunday
                        # the prior sunday will be used.
                        relative_base=week_start + relativedelta(days=1))[0]
                    for _ in range(ordinal_number - 1):
                        week_start = date_to_datetime_range(
                            "Sunday",
                            relative_base=week_start + relativedelta(days=1),
                            prefer_dates_from='future')[0]
                    return [week_start, week_start + relativedelta(days=7)]
                elif unit == 'month':
                    month_name = datetime.datetime(2000, ordinal_number,
                                                   1).strftime("%B ")
                    return date_to_datetime_range(month_name + rest)
                else:
                    raise Exception("Unknown time unit: " + unit)
            # handle dates like "1950s" since dateparser doesn't
            decade_match = re.match(r"(\d{4})s", text)
            if decade_match:
                decade = int(decade_match.groups()[0])
                return [
                    datetime.datetime(decade, 1, 1),
                    datetime.datetime(decade + 10, 1, 1)
                ]
            parser = DateDataParser(
                ['en'],
                settings={
                    'RELATIVE_BASE': relative_base or datetime.datetime.now(),
                    'PREFER_DATES_FROM': prefer_dates_from
                })
            try:
                text = re.sub(r" year$", "", text)
                date_data = parser.get_date_data(text)
            except (TypeError, ValueError):
                return
            if date_data['date_obj']:
                date = date_data['date_obj']
                if date_data['period'] == 'day':
                    return [date, date + relativedelta(days=1)]
                elif date_data['period'] == 'month':
                    date = datetime.datetime(date.year, date.month, 1)
                    return [date, date + relativedelta(months=1)]
                elif date_data['period'] == 'year':
                    date = datetime.datetime(date.year, 1, 1)
                    return [date, date + relativedelta(years=1)]

        def parse_non_relative_date(text):
            """Return a datetime only if *text* parses as an absolute date."""
            result = date_to_datetime_range(text,
                                            relative_base=datetime.datetime(
                                                900, 1, 1))
            if result and result[0].year > 1000:
                # If the year is less than 1000 assume the year 900
                # base date was used when parsing so the date is relative.
                return result[0]

        if 'structured_data' not in doc.tiers:
            doc.add_tiers(StructuredDataAnnotator())
        if 'spacy.nes' not in doc.tiers:
            doc.add_tiers(SpacyAnnotator())
        # Create a combine tier of nes and regex dates
        date_span_tier = doc.tiers['spacy.nes'].with_label('DATE')
        # Regex for formatted dates
        regex = re.compile(
            r"\b("
            # parenthetical year
            r"((?<=[\[\(])[1-2]\d{3}(?=[\]\)]))|"
            # date MonthName yyyy
            r"(\d{1,2} [a-zA-Z]{3,} \[?\d{4})|"
            # dd-mm-yyyy
            r"(\d{1,2} ?[\/\-] ?\d{1,2} ?[\/\-] ?\d{1,4})|"
            # yyyy-MMM-dd
            r"(\d{1,4} ?[\/\-] ?[a-z]{3,4} ?[\/\-] ?\d{1,4})|"
            # yyyy-mm-dd
            r"(\d{1,4} ?[\/\-] ?\d{1,2} ?[\/\-] ?\d{1,2})"
            r")\b",
            re.I)
        match_tier = doc.create_regex_tier(regex)
        date_span_tier += match_tier
        # Add year components individually incase the full spans are thrown out.
        # Sometimes extra text is added to dates that makes them invalid,
        # this allows some of the date to be recovered.
        date_span_tier += date_span_tier.match_subspans(r"([1-2]\d{3})")
        # Remove spans that are probably ages.
        date_span_tier = date_span_tier.without_overlaps(
            date_span_tier.match_subspans(r"\bage\b"))
        # Group adjacent date info in case it is parsed as separate chunks.
        # ex: Friday, October 7th 2010.
        adjacent_date_spans = date_span_tier.combined_adjacent_spans(
            max_dist=9)
        grouped_date_spans = []

        def can_combine(text):
            """True when *text* alone is not a complete, strictly-parsable date."""
            if re.match(r"\d{4}", text, re.I):
                # year only date
                return True
            try:
                return strict_parser.get_date_data(text)['date_obj'] is None
            except (TypeError, ValueError):
                return True

        for date_group in adjacent_date_spans:
            date_group_spans = list(date_group.iterate_leaf_base_spans())
            if any(can_combine(span.text) for span in date_group_spans):
                if date_to_datetime_range(date_group.text) is not None:
                    grouped_date_spans.append(date_group)
        # Find date ranges by looking for joiner words between dates.
        date_range_joiners = [
            t_span for t_span in doc.tiers['spacy.tokens']
            if re.match(r"(" + DATE_RANGE_JOINERS +
                        r"|\-)$", t_span.text, re.I)
        ]
        date_range_tier = date_span_tier.label_spans('start')\
            .with_following_spans_from(date_range_joiners, max_dist=3)\
            .with_following_spans_from(date_span_tier.label_spans('end'), max_dist=3)\
            .label_spans('date_range')
        since_tokens = AnnoTier([
            t_span for t_span in doc.tiers['spacy.tokens']
            if 'since' == t_span.token.lemma_
        ],
                                presorted=True).label_spans('since_token')
        since_date_tier = (
            since_tokens.with_following_spans_from(date_span_tier,
                                                   allow_overlap=True) +
            date_span_tier.with_contained_spans_from(since_tokens)
        ).label_spans('since_date')
        tier_spans = []
        all_date_spans = AnnoTier(date_range_tier.spans + grouped_date_spans +
                                  date_span_tier.spans + since_date_tier.spans)

        if detect_date:
            simple_date_spans = AnnoTier(
                grouped_date_spans +
                date_span_tier.spans).optimal_span_set(prefer='text_length')
            latest_date = None
            for span in simple_date_spans:
                if re.match(r"today|yesterday", span.text, re.I):
                    continue
                try:
                    span_date = strict_parser.get_date_data(
                        span.text)['date_obj']
                except (TypeError, ValueError):
                    continue
                if span_date and span_date < datetime.datetime.now():
                    if not latest_date or span_date > latest_date:
                        latest_date = span_date
            if latest_date:
                doc_date = latest_date

        date_spans_without_structured_data = all_date_spans.without_overlaps(
            doc.tiers['structured_data'])
        date_spans_in_structured_data = []
        dates_by_structured_value = doc.tiers['structured_data.values']\
            .group_spans_by_containing_span(all_date_spans, allow_partial_containment=False)
        for value_span, date_spans in dates_by_structured_value:
            date_spans_in_structured_data += date_spans
        all_date_spans = AnnoTier(date_spans_without_structured_data.spans +
                                  date_spans_in_structured_data
                                  ).optimal_span_set(prefer='text_length')
        for date_span in all_date_spans:
            # Parse the span text into one or two components depending on
            # whether it contains multiple dates for specifying a range.
            if date_span.label == 'date_range':
                range_component_dict = date_span.groupdict()
                range_components = [
                    range_component_dict['start'][0].text,
                    range_component_dict['end'][0].text
                ]
            else:
                # BUG FIX: re.I was previously passed positionally, where
                # re.split() expects ``maxsplit``, so case-insensitivity was
                # never applied; pass it as the ``flags`` keyword instead.
                range_components = re.split(
                    r"\b(?:" + DATE_RANGE_JOINERS + r")\b", date_span.text,
                    flags=re.I)
                if len(range_components) == 1:
                    hyphenated_components = date_span.text.split("-")
                    if len(hyphenated_components) == 2:
                        range_components = hyphenated_components
                    elif len(hyphenated_components) == 6:
                        # Handle dote ranges like 2015-11-3 - 2015-11-6
                        range_components = [
                            '-'.join(hyphenated_components[:3]),
                            '-'.join(hyphenated_components[3:])
                        ]
            if ends_with_timeunit_re.match(
                    date_span.text) and not relative_duration_range_re.match(
                        date_span.text):
                # Prevent durations like "5 days" from being parsed as specific
                # dates like "5 days ago"
                continue
            elif len(range_components) == 1:
                if date_span.label == 'since_date':
                    date_str = [
                        span for span in date_span.base_spans[0].base_spans
                        if span.label != 'since_token'
                    ][0].text
                    datetime_range = date_to_datetime_range(date_str)
                    if datetime_range is None:
                        continue
                    datetime_range = [datetime_range[0], doc_date]
                else:
                    date_str = range_components[0]
                    datetime_range = date_to_datetime_range(date_str)
                    if datetime_range is None:
                        continue
            elif len(range_components) == 2:
                # Handle partial years (e.g.: 2001-12)
                if re.match(r"\d{1,2}$", range_components[1]):
                    if re.match(r".*\d{1,2}$", range_components[0]):
                        characters_to_sub = "1"
                        if len(range_components[1]) > 1:
                            characters_to_sub = "1,2"
                        range_components[1] = re.sub(
                            r"\d{" + characters_to_sub + "}$",
                            range_components[1], range_components[0])
                # Check for a non-relative date in the range that can be used as
                # a relative base date the other date.
                # Example: March 3 to November 2 1984
                non_relative_dates = [
                    parse_non_relative_date(text) for text in range_components
                ]
                relative_base_date = next((x for x in non_relative_dates if x),
                                          doc_date)
                datetime_range_a = date_to_datetime_range(
                    range_components[0], relative_base=relative_base_date)
                datetime_range_b = date_to_datetime_range(
                    range_components[1], relative_base=relative_base_date)
                if datetime_range_a is None and datetime_range_b is None:
                    continue
                elif datetime_range_a is None:
                    datetime_range = datetime_range_b
                elif datetime_range_b is None:
                    datetime_range = datetime_range_a
                else:
                    # If include_end_date is False treat the span's daterange
                    # as ending at the start of the second date component unless
                    # a word like "through" is used in the second component.
                    if self.include_end_date or\
                       re.search(r"\bthrough\b", date_span.text) or\
                       re.search(r"\b(late|end of)\b", range_components[1]):
                        datetime_range = [
                            datetime_range_a[0], datetime_range_b[1]
                        ]
                    else:
                        datetime_range = [
                            datetime_range_a[0], datetime_range_b[0]
                        ]
            else:
                print("Bad date range split:", date_span.text,
                      range_components)
                continue
            # Omit reverse ranges because they usually come from something
            # being incorrectly parsed. The main exception is relative dates
            # like 2 to 3 weeks ago.
            if datetime_range[0] <= datetime_range[1]:
                tier_spans.append(DateSpan(date_span, datetime_range))
        return {
            'dates': AnnoTier(tier_spans, presorted=True),
            # Include unparsable and non-specific dates
            'dates.all': all_date_spans
        }
Пример #20
0
# -*- coding: utf-8 -*-
import scrapy
import html2text
from dateparser.date import DateDataParser
import dateparser
import logging

# Module-level parser: English only, previously-detected locales are not
# retried between calls.
dparser = DateDataParser(languages=['en'], try_previous_locales=False)

# Set html2text configuration
# Drop links, images and emphasis markers; BODY_WIDTH = 0 disables line
# wrapping so extracted article text stays on single lines.
html2text.config.IGNORE_ANCHORS = True
html2text.config.IGNORE_IMAGES = True
html2text.config.IGNORE_EMPHASIS = True
html2text.config.BODY_WIDTH = 0


class EastAfrican(scrapy.Spider):
    """
    Spider for the local news site EastAfrican. Works the same for

    Business Daily (africa) --> contextsIds=539444 -->  74196 artikelen
    The citizen (Tanzania) --> contextsIds=1765046 --> 71839 artikelen
    Daily Nation kenya --> contextsIds=1148 --> 474712  artikelen
    The east african Kenya -->contextsIds=2456  52513 artikelen (vanaf deze site werken)
    Daily Monitor Uganda --> contextsIds=691150 --> 174375 artikelen
    """

    name = "Kenya_EastAfrican_spider"
    download_delay = 2

    def start_requests(self):
Пример #21
0
 def date_to_datetime_range(text,
                            relative_base=None,
                            prefer_dates_from='past'):
     """Parse *text* into a [start, end) pair of datetimes, or None.

     NOTE(review): relies on names from an enclosing scope (doc_date,
     relative_duration_range_re, clean_date_str, ordinal_date_re,
     ORDINALS) -- it is not usable standalone.
     """
     if relative_base is None:
         relative_base = doc_date
     # Handle relative date ranges like "the past ___ days"
     relative_num_days = re.sub(relative_duration_range_re, "", text)
     if len(relative_num_days) < len(text):
         num_days_datetime_range = date_to_datetime_range(
             relative_num_days)
         if not num_days_datetime_range:
             return None
         return [num_days_datetime_range[0], relative_base]
     text = clean_date_str(text)
     if len(text) < 3:
         return None
     # Handle ordinal dates like "the second month of 2006"
     match = ordinal_date_re.match(text)
     if match:
         match_dict = match.groupdict()
         if match_dict['ordinal']:
             ordinal_number = ORDINALS.index(match_dict['ordinal']) + 1
         else:
             ordinal_number = int(match_dict['ordinal_number'])
         unit = match_dict['unit']
         rest = match_dict['rest']
         if unit == 'day':
             return date_to_datetime_range(
                 str(ordinal_number) + " " + rest)
         elif unit == 'week':
             if ordinal_number > 4:
                 return
             parsed_remainder = date_to_datetime_range("1 " + rest)
             if not parsed_remainder:
                 return
             week_start = parsed_remainder[0]
             week_start = date_to_datetime_range(
                 "Sunday",
                 # A day is added because if the base date is on Sunday
                 # the prior sunday will be used.
                 relative_base=week_start + relativedelta(days=1))[0]
             for _ in range(ordinal_number - 1):
                 week_start = date_to_datetime_range(
                     "Sunday",
                     relative_base=week_start + relativedelta(days=1),
                     prefer_dates_from='future')[0]
             return [week_start, week_start + relativedelta(days=7)]
         elif unit == 'month':
             month_name = datetime.datetime(2000, ordinal_number,
                                            1).strftime("%B ")
             return date_to_datetime_range(month_name + rest)
         else:
             raise Exception("Unknown time unit: " + unit)
     # handle dates like "1950s" since dateparser doesn't
     decade_match = re.match(r"(\d{4})s", text)
     if decade_match:
         decade = int(decade_match.groups()[0])
         return [
             datetime.datetime(decade, 1, 1),
             datetime.datetime(decade + 10, 1, 1)
         ]
     parser = DateDataParser(
         ['en'],
         settings={
             'RELATIVE_BASE': relative_base or datetime.datetime.now(),
             'PREFER_DATES_FROM': prefer_dates_from
         })
     try:
         text = re.sub(r" year$", "", text)
         date_data = parser.get_date_data(text)
     except (TypeError, ValueError):
         return
     # Widen the parsed instant to a range matching its precision.
     if date_data['date_obj']:
         date = date_data['date_obj']
         if date_data['period'] == 'day':
             return [date, date + relativedelta(days=1)]
         elif date_data['period'] == 'month':
             date = datetime.datetime(date.year, date.month, 1)
             return [date, date + relativedelta(months=1)]
         elif date_data['period'] == 'year':
             date = datetime.datetime(date.year, 1, 1)
             return [date, date + relativedelta(years=1)]
Пример #22
0
 def _parser_get_date(self, date_string, date_formats, languages):
     """Build a DateDataParser for *languages* and parse *date_string*."""
     language_parser = DateDataParser(languages)
     return language_parser.get_date_data(date_string, date_formats)
Пример #23
0
def get_transaction_list(page=1, save=False, path="./bsc-txns/"):
    """Scrape one page of the token-transfer table from bscscan.com/tokentxns.

    :param page: 1-based results page to request (100 rows per page).
    :param save: when True, dump the frame as ``<unix-ts>.json`` under *path*.
    :param path: directory for the JSON dump (default ``./bsc-txns/``).
    :return: pandas DataFrame with columns tx_hash, age, from, to, value,
        token and token_symbol; ``age`` is converted to a timestamp.
    """
    headers = {
        'authority': 'bscscan.com',
        'cache-control': 'max-age=0',
        'upgrade-insecure-requests': '1',
        'origin': 'https://bscscan.com',
        'content-type': 'application/x-www-form-urlencoded',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/89.0.4389.72 Safari/537.36',
        'accept':
        'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
        'sec-gpc': '1',
        'sec-fetch-site': 'same-origin',
        'sec-fetch-mode': 'navigate',
        'sec-fetch-user': '******',
        'sec-fetch-dest': 'document',
        'referer': 'https://bscscan.com/tokentxns',
        'accept-language': 'en-US,en;q=0.9',
    }
    # ASP.NET postback payload selecting 100 records per page; the viewstate
    # values were captured from a live session and may expire.
    data = {
        '__EVENTTARGET': 'ctl00$ContentPlaceHolder1$ddlRecordsPerPage',
        '__EVENTARGUMENT': '',
        '__LASTFOCUS': '',
        '__VIEWSTATE':
        'tyaWVZtVIcox53PAgl8Cg7o4rS646MXzyP0MBL24NBXx/igPkQwAgUalGPJ/kcAaFUULU/TFjWJp66Dh2dyRl/lMLEP5UuVosqryUatP7+A=',
        '__VIEWSTATEGENERATOR': 'CBF7936C',
        '__EVENTVALIDATION':
        'cej1e9PiZXQnlScJdUIUjpTw0bIsJamvVpvKJ7ZFAZ1uANm06lSzLVsz0Chy9zEqejUFjYNxHWMsb86MBc7aMVZ836Kd1/uRB3S87lrsxszHSDwpuN997C7prJA1AEAuBSmBrSvExpsscrjglOaQDAqK7Zer5pd+kuxPjm7voI1Hj2rBWbK4Fd9ZpwsKCZ1T+z9CpAn4raYBh4woFm7rgQ==',
        'ctl00$ContentPlaceHolder1$ddlRecordsPerPage': '100'
    }

    response = requests.post(f'https://bscscan.com/tokentxns?ps=100&p={page}',
                             headers=headers,
                             data=data)
    df = pd.read_html(response.text)[0]
    df.columns = [
        "view", "tx_hash", "age", "from", "icon", "to", "value", "token"
    ]
    df = df[["tx_hash", "age", "from", "to", "value", "token"]]

    def _token_symbol(cell):
        # Token cells usually look like "Name (SYMBOL)"; fall back to the
        # raw cell when no parenthesised symbol is present instead of
        # crashing on re.search(...) returning None.
        match = re.search(r'\((.*?)\)', str(cell))
        return match.group(1) if match else str(cell)

    df["token_symbol"] = df["token"].apply(_token_symbol)
    ddp = DateDataParser(languages=['en'])
    # Use the public date_obj attribute instead of poking __dict__; the
    # one-hour correction compensates for the site's relative-age offset
    # (presumably a timezone skew — TODO confirm against live data).
    df["age"] = df["age"].apply(lambda x: pd.to_datetime(
        ddp.get_date_data(str(x)).date_obj - timedelta(hours=1)))
    print(f"Found {len(df)} records")

    if save:
        timestamp = int(time.time())
        df.to_json(f"{path}{timestamp}.json", orient="records")

    # Random pause to stay polite toward the site when called in a loop.
    sleep_time = random.randint(1, 4)
    print(f"Sleeping for {sleep_time} seconds")
    time.sleep(sleep_time)
    return df