Exemplo n.º 1
0
 def parse_date(value):
     """Convert raw scraped time data into start/end Unix timestamps.

     ``value`` must contain a 'date_format' key describing how the source
     site renders dates; any time-data keys missing from ``value`` are
     filled in from the defaults produced by ``create_time_data``.
     """
     helper = TimeUtils(date_format=value['date_format'])
     merged = {**create_time_data(), **value}
     start_ts, end_ts = helper.get_timestamps(merged)
     return {'start_timestamp': start_ts, 'end_timestamp': end_ts}
Exemplo n.º 2
0
    def __init__(self, base_url, start_date, end_date, date_format):
        """Set up date parsing for one target site.

        date_format is the string that specifies the date style of the
        target website.
        """
        self.base_url = base_url
        self.time_utils = TimeUtils(date_format)
        for attr, raw in (('start_date', start_date), ('end_date', end_date)):
            setattr(self, attr, self.time_utils.get_date(raw))

        # \xa0 is returned when a &nbsp is encountered
        # Add any extraneous html data not removed by lxml here
        self.misc_text_to_remove = r'[\xa0]'
Exemplo n.º 3
0
    def __init__(self, base_url, start_date, end_date, date_format, request_date_format=None):
        """Configure date conversion between the site's format and requests.

        date_format is the string that specifies the date style of the
        target website; request_date_format describes the format outgoing
        requests expect and defaults to date_format.
        """
        # Fix: compare to the None singleton with `is`, not `==` (PEP 8)
        if request_date_format is None:
            request_date_format = date_format

        self.time_utils = TimeUtils(old_date_format=date_format, new_date_format='%m-%d-%Y')
        self.base_url = base_url

        # get_dates returns a list; only the first parsed date is needed here
        request_format_utils = TimeUtils(old_date_format='%m-%d-%Y', new_date_format=request_date_format)
        self.start_date = request_format_utils.get_dates(start_date)[0]
        self.end_date = request_format_utils.get_dates(end_date)[0]
Exemplo n.º 4
0
 def __init__(self, host, apikey, spid):
     """Initialize the crawler with its API credentials and storage helpers."""
     super(BasicCrawler, self).__init__()
     self.host = host
     self.apikey = apikey
     self.spid = spid
     # Be careful with the table names
     self.dbSaver = NovelSaver(BasicCrawler.getDefaultDBConf(), 'ien_book',
                               'ien_chapter')
     self.lock = thread.allocate_lock()
     # Create the book image directory if it doesn't exist.
     # Fetch the day once so the local and server paths can never disagree
     # if this runs across midnight (original called getDay() twice).
     day = TimeUtils.getDay()
     # Fix: os.path.join instead of manual `cwd + os.sep + ...` concatenation
     self.img_dir = os.path.join(os.getcwd(), "images", day)
     # Server path is a URL fragment, so it keeps forward slashes explicitly
     self.server_img_dir = 'uploads/images/' + day
     FileUtils.mkdir_p(self.img_dir)
Exemplo n.º 5
0
    def __init__(self,
                 base_url,
                 start_date,
                 end_date,
                 date_format,
                 request_date_format=None):
        """Configure the date range and docker endpoint for this aggregator.

        date_format is the string that specifies the date style of the
        target website; request_date_format (defaults to date_format)
        describes how dates must be formatted in outgoing requests.
        """
        # Fix: compare to the None singleton with `is`, not `==` (PEP 8)
        if request_date_format is None:
            request_date_format = date_format

        self.time_utils = TimeUtils(date_format)
        self.base_url = base_url

        self.update_mutex = Lock()

        # Incoming CLI-style dates are assumed to be %m-%d-%Y and converted
        # to whatever the request needs — TODO confirm against callers
        request_format_utils = TimeUtils('%m-%d-%Y')
        self.start_date = request_format_utils.convert_date_format(
            start_date, request_date_format)
        self.end_date = request_format_utils.convert_date_format(
            end_date, request_date_format)
        self.start_timestamp = request_format_utils.min_timestamp_for_day(
            start_date)
        self.end_timestamp = request_format_utils.max_timestamp_for_day(
            end_date)

        try:
            self.docker_ip = os.environ['DOCKER_IP']
        except KeyError:
            # Fix: keep the attribute defined so later reads raise/compare
            # predictably instead of failing with AttributeError
            self.docker_ip = None
            print(
                'Error: DOCKER_IP not set. If this value was recently set, close all python processes and try again'
            )
Exemplo n.º 6
0
    def __init__(self,
                 organization,
                 base_url,
                 start_date,
                 end_date,
                 date_format,
                 request_date_format=None):
        """Configure date handling and identity for one organization's scraper.

        date_format is the string that specifies the date style of the
        target website; request_date_format (defaults to date_format)
        describes how dates must be formatted in outgoing requests.
        """
        self.organization = organization
        # Fix: compare to the None singleton with `is`, not `==` (PEP 8)
        if request_date_format is None:
            request_date_format = date_format

        self.date_format = date_format
        self.time_utils = TimeUtils(date_format)
        self.base_url = base_url
        # Strip non-word characters so the url doubles as a safe identifier
        self.identifier = re.sub(r'\W', '', base_url)
        self.event_manager = EventManager()

        request_format_utils = TimeUtils('%m-%d-%Y')
        # When this is running for multiple days, validating if the date is in the past causes issues
        self.start_date = request_format_utils.convert_date_format(
            start_date, request_date_format, validate_past=False)
        self.end_date = request_format_utils.convert_date_format(
            end_date, request_date_format, validate_past=False)
        self.start_timestamp = request_format_utils.min_timestamp_for_day(
            start_date)
        self.end_timestamp = request_format_utils.max_timestamp_for_day(
            end_date)
Exemplo n.º 7
0
class AggregatorBase:
    """Functionality shared by spiders and API-based aggregator classes."""

    def __init__(self, base_url, start_date, end_date, date_format):
        # date_format is the string that specifies the date style of the target website
        self.time_utils = TimeUtils(date_format)
        self.base_url = base_url
        self.start_date = self.time_utils.get_date(start_date)
        self.end_date = self.time_utils.get_date(end_date)

        # \xa0 is returned when a &nbsp is encountered
        # Add any extraneous html data not removed by lxml here
        self.misc_text_to_remove = r'[\xa0]'

    def remove_html(self, html_data, remove_all=False):
        """Strip html (and optionally all non-word chars) from a string or list of strings."""
        if isinstance(html_data, list):
            return [self.remove_html(data, remove_all) for data in html_data]

        html_removed = BeautifulSoup(html_data, 'lxml').extract().text
        # Bug fix: re.UNICODE was passed positionally, landing in re.sub's
        # `count` parameter (re.UNICODE == 32) and silently capping the
        # number of substitutions at 32. It must be passed as `flags=`.
        misc_removed = re.sub(self.misc_text_to_remove, '', html_removed,
                              flags=re.UNICODE)
        if not remove_all:
            return misc_removed
        # Optionally remove all non-word characters and newlines (everything except numbers and letters)
        return re.sub(r'\W+', '', misc_removed, flags=re.UNICODE)

    def pretty_json(self, json_data):
        """Return json_data serialized with 4-space indentation."""
        return json.dumps(json_data, indent=4)

    def print_json(self, json_data):
        print(self.pretty_json(json_data))

    def pretty_html(self, html_data):
        """Return html_data re-indented by BeautifulSoup."""
        return BeautifulSoup(html_data, 'lxml').prettify()

    def print_html(self, html_data):
        print(self.pretty_html(html_data))

    def write_to_file(self, filename, data):
        """Replace filename's contents with data, encoded as utf-8."""
        # Removing first (rather than relying on 'w' truncation) also
        # replaces symlinks/special files with a fresh regular file
        if os.path.isfile(filename):
            os.remove(filename)

        with open(filename, 'w', encoding='utf-8') as f:
            f.writelines(data)
Exemplo n.º 8
0
class EventTransformPipeline:
    """Scrapy pipeline that tags items with their organization and drops
    events outside the spider's configured time range."""

    def __init__(self):
        self.time_utils = TimeUtils()

    def process_item(self, item, spider):
        """Attach spider metadata to the item and filter by time range.

        Raises DropItem when the event's time window falls outside the
        spider's configured start/end timestamps.
        """
        item['organization'] = spider.organization
        if 'event_time' in item:
            # The loader needs the spider's date format to parse event_time
            item['event_time']['date_format'] = spider.date_format
        loader = EventLoader(**item)
        if 'event_time' in loader.item:
            time = loader.item['event_time']
            if self.time_utils.time_range_is_between(time['start_timestamp'],
                                                     time['end_timestamp'],
                                                     spider.start_timestamp,
                                                     spider.end_timestamp):
                return loader.item
            else:
                # Fix: corrected typo 'confured' in the user-facing message
                raise DropItem('Event is not in the configured timeframe')
        else:
            # No time data at all — pass the item through unfiltered
            return loader.item
Exemplo n.º 9
0
    def __init__(self,
                 base_url,
                 start_date,
                 end_date,
                 date_format,
                 request_date_format=None):
        """Configure the date range and identity for this aggregator.

        date_format is the string that specifies the date style of the
        target website; request_date_format (defaults to date_format)
        describes how dates must be formatted in outgoing requests.
        """
        # Fix: compare to the None singleton with `is`, not `==` (PEP 8)
        if request_date_format is None:
            request_date_format = date_format

        self.time_utils = TimeUtils(date_format)
        self.base_url = base_url
        # Strip non-word characters so the url doubles as a safe identifier
        self.identifier = re.sub(r'\W', '', base_url)
        self.update_mutex = Lock()

        request_format_utils = TimeUtils('%m-%d-%Y')
        self.start_date = request_format_utils.convert_date_format(
            start_date, request_date_format)
        self.end_date = request_format_utils.convert_date_format(
            end_date, request_date_format)
        self.start_timestamp = request_format_utils.min_timestamp_for_day(
            start_date)
        self.end_timestamp = request_format_utils.max_timestamp_for_day(
            end_date)
Exemplo n.º 10
0
class Event(scrapy.Item):
    """A scraped event whose date/time/url fields are normalized on
    assignment via the __setitem__ override below."""
    date = scrapy.Field()
    # setting start_time or end_time will also update time_range when __setitem__ is called and vice versa
    start_time = scrapy.Field()
    end_time = scrapy.Field()
    time_range = scrapy.Field()
    organization = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    url = scrapy.Field()
    price = scrapy.Field()
    category = scrapy.Field()

    # Shared formatter, built once at class-definition time and used by
    # every instance's __setitem__ below
    time_helper = TimeUtils(date_format = '%m-%d-%Y', time_format = '%H:%M')

    @classmethod
    def from_dict(cls, event_dict):
        """Build an Event from a plain dict, routing every value through
        __setitem__ so it gets the same normalization as direct assignment."""
        event = cls()
        for key, value in event_dict.items():
            event[key] = value
        return event

    def get_item_with_default(self, key, default = ''):
        """Like __getitem__, but returns `default` instead of raising KeyError."""
        try:
            return super().__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        """Normalize certain values before storing them.

        Unfortunately, property decorators don't work with dictionary keys.
        Instead, intercept the __setitem__ call for certain properties and
        format them before saving.
        """
        scrapy_set_item = super().__setitem__
        if key == 'date':
            scrapy_set_item(key, self.time_helper.get_date(value))

        # time_range = start_time - end_time
        # Whenever one property changes, also update the other one(s)
        elif key in ('start_time', 'end_time'):
            # Store the parsed time first so the re-read below sees it
            scrapy_set_item(key, self.time_helper.get_time(value))
            start = self.get_item_with_default('start_time')
            end = self.get_item_with_default('end_time')
            scrapy_set_item('time_range', self.time_helper.format_start_end(start, end))

        elif key == 'time_range':
            # A combined range sets both individual times, then re-formats itself
            start, end = self.time_helper.get_times(value)
            scrapy_set_item('start_time', start)
            scrapy_set_item('end_time', end)
            scrapy_set_item(key, self.time_helper.format_start_end(start, end))

        elif key == 'url':
            # NOTE(review): rstrip('//') treats '//' as a character set
            # (identical to rstrip('/')) — strips ALL trailing slashes.
            # Presumably intentional; confirm.
            scrapy_set_item(key, value.strip().rstrip('//'))

        else:
            super().__setitem__(key, value)

    def update(self, event):
        """dict.update-style merge that still routes values through __setitem__."""
        for key, value in event.items():
            self[key] = value

    def props_to_csv(self):
        """Return this item's keys as one newline-terminated CSV header row."""
        return ','.join(self.keys()) + '\n'

    def vals_to_csv(self):
        """Return this item's values as one CSV data row; embedded double
        quotes are removed rather than escaped."""
        return ','.join('"{0}"'.format(str(self[key]).replace('"', '')) for key in self.keys()) + '\n'
Exemplo n.º 11
0
class Event(scrapy.Item):
    """A scraped event stored as Unix timestamps.

    When creating an event, use these values as keys.
    The exception to this is start_timestamp and end_timestamp.
    You can pass those in directly if the data is already formatted
    as a Unix timestamp, otherwise, pass in the data as defined in the
    create_time_data function below.
    """
    start_timestamp = scrapy.Field()
    end_timestamp = scrapy.Field()
    organization = scrapy.Field()
    title = scrapy.Field()
    description = scrapy.Field()
    address = scrapy.Field()
    url = scrapy.Field()
    price = scrapy.Field()
    category = scrapy.Field()

    # Shared parser, built at class-definition time; its date_format is
    # overwritten per-event by set_time_format below
    time_utils = TimeUtils()

    def set_time_format(self, date_format):
        # NOTE(review): mutates the CLASS-level TimeUtils, so the format
        # set here is shared by all Event instances — confirm intended
        self.time_utils.date_format = date_format

    @staticmethod
    def create_time_data():
        # When creating an event, you'll want to pass in the data that matches
        # how the data is formatted on the site you're pulling from
        return {
            # Use time if only one time is supplied for the event (not time range)
            'time': None,
            # Use start_time and end_time if the site supplies distinct data for these two values
            'start_time': None,
            'end_time': None,
            # Use time_range if the start and end time is supplied in a single string ex: 6:00-8:00 PM
            'time_range': None,
            # Use date if the event could be one or multiple days but it is contained in a single string
            # This is done this way because some sites have data that could be single days or multiple days
            'date': None,
            # Use start_date and end_date if the site supplies distinct data for these two values
            'start_date': None,
            'end_date': None,
            # Use start_timestamp and end_timestamp if the data is formatted like a Unix timestamp
            'start_timestamp': None,
            'end_timestamp': None
        }

    @classmethod
    def from_dict(cls, event_dict, date_format=''):
        """Build an Event from a plain dict.

        Keys recognized by create_time_data are collected into a single
        'time_data' assignment (converted to timestamps by __setitem__);
        all other keys are assigned directly.
        """
        event = cls()
        event.set_time_format(date_format)

        time_data = Event.create_time_data()
        time_data_set = False
        for key, value in event_dict.items():
            value = DataUtils.remove_html(value)
            if key in time_data:
                time_data[key] = value
                time_data_set = True
            else:
                event[key] = value
        if time_data_set:
            event['time_data'] = time_data
        return event

    def to_dict(self):
        """Return a plain-dict copy of the currently set fields."""
        return {key: self[key] for key in self.keys()}

    def get_item_with_default(self, key, default=''):
        """Like __getitem__, but returns `default` instead of raising KeyError."""
        try:
            return super().__getitem__(key)
        except KeyError:
            return default

    def __setitem__(self, key, value):
        """Normalize certain values before storing them.

        Unfortunately, property decorators don't work with dictionary keys.
        Instead, intercept the __setitem__ call for certain properties and
        format them before saving.
        """
        scrapy_set_item = super().__setitem__
        if key == 'time_data':
            # 'time_data' itself is never stored (it is not a Field);
            # it is converted into the two timestamp fields instead
            start_timestamp, end_timestamp = self.time_utils.get_timestamps(
                value)
            scrapy_set_item('start_timestamp', start_timestamp)
            scrapy_set_item('end_timestamp', end_timestamp)

        elif key == 'url':
            # NOTE(review): rstrip('//') treats '//' as a character set
            # (same as '/') — strips ALL trailing slashes; confirm intended
            scrapy_set_item(key, value.strip().rstrip('//'))

        elif key == 'category':
            # Can't serialize enums to json, so store the enum member's name
            scrapy_set_item(key, value.name)

        else:
            scrapy_set_item(key, value)

    def update(self, event):
        """dict.update-style merge that still routes values through __setitem__."""
        for key, value in event.items():
            self[key] = value

    def props_to_csv(self):
        """Return this item's keys as one newline-terminated CSV header row."""
        return ','.join(self.keys()) + '\n'

    def vals_to_csv(self):
        """Return this item's values as one CSV data row; embedded double
        quotes are removed rather than escaped."""
        return ','.join('"{0}"'.format(str(self[key]).replace('"', ''))
                        for key in self.keys()) + '\n'
Exemplo n.º 12
0
 def __init__(self):
     """Initialize with a TimeUtils helper in its default configuration."""
     self.time_utils = TimeUtils()