Exemplo n.º 1
0
    def request(self):
        """Scrape the newsletter listing and store every link it contains.

        Downloads the newsletter category page, selects all anchors below
        ``#content-left`` and passes them, in reversed document order, to
        ``session.get_or_create``. A failing HTTP status is logged as a
        warning and the ``HTTPError`` is re-raised.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.NEWSLETTER.value)
                response = browser_session.get(url)
                response.raise_for_status()

                page = bs4.BeautifulSoup(response.content, 'html.parser')
                anchors = page.select('#content-left a')
                LOGGER.info('{} newsletters have been found'.format(
                    len(anchors)))

                if not anchors:
                    LOGGER.info('Done, no more newsletters')
                    return

                for anchor in reversed(anchors):
                    href = anchor['href']
                    _, created = session.get_or_create(Newsletter,
                                                       url=href,
                                                       title=anchor.text)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(
                            f'Newsletter "{href}" ...skipped (duplicate)')
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemplo n.º 2
0
 def unexpected_tag(self, tag, tagName):
     """Report a tag that appeared where it was not expected and skip it.

     Reads the 16-bit length following the tag, logs the tag at EXTRA_DATA
     level and then consumes ``length - 2`` more bytes from the file.
     """
     length = self.read_uint16()
     # cur() is sampled after the length field was read; the "- 2" presumably
     # moves the reported offset back to the start of that field.
     position = self.fileObject.cur() - 2
     message = '[0x%x] tag %s(%s) appears unexpected, length: %d.' % (
         position, tagName, tag.encode('hex'), length)
     LOGGER.log(CustomLoggingLevel.EXTRA_DATA, message)
     self.fileObject.read(length - 2)
Exemplo n.º 3
0
    def read_scandata(self):
        """Collect the scan data that follows the current file position.

        Everything from the current position to the end of the file is read
        in one go. Bytes are copied into ``self.scanData`` until ``tag_eoi``
        is invoked for the 0xFFD9 marker; the loop relies on
        ``self.scanFlag`` becoming falsy at that point. Bytes left after the
        marker are handed to ``showextradata``.

        A buffer that ends before 0xFFD9 is seen (or that ends on a lone
        0xFF) no longer raises IndexError: the loop stops at the end of the
        buffer and a warning is logged instead.
        """
        curPos = self.fileObject.cur()
        self.scanDataPos = curPos
        LOGGER.log(CustomLoggingLevel.IMAGE_DEBUG, 'Start to read scan data.')

        # read all data to improve process speed
        tmpdata = self.fileObject.read(self.fileObject.size - curPos)
        dataLength = len(tmpdata)
        index = 0
        while self.scanFlag and index < dataLength:
            # A marker needs two bytes; a trailing lone 0xFF is plain data.
            if tmpdata[index] == '\xff' and index + 1 < dataLength:
                if tmpdata[index + 1] == '\xd9':
                    self.tag_eoi('\xff\xd9')
                else:
                    self.scanData.append(tmpdata[index])
                    self.scanData.append(tmpdata[index + 1])
                index += 2
            else:
                self.scanData.append(tmpdata[index])
                index += 1
        if self.scanFlag:
            # The buffer ran out while the scan was still flagged as active.
            LOGGER.warning(
                '[0x%x] Scan data ended without EOI marker (0xFFD9).' %
                (curPos + index))
        if index < dataLength:
            self.showextradata(tmpdata[index:], curPos + index)
        self.scanDataLength = len(self.scanData)
        LOGGER.log(
            CustomLoggingLevel.IMAGE_INFO,
            'Scan data start at 0x%x, length: 0x%x.' %
            (curPos, self.scanDataLength))
0
    def request(self):
        """Scrape the paginated invoice listing and store every PDF invoice.

        Starting at the invoice category URL, each page's table rows
        (excluding ``.hlavicka`` header rows) are turned into ``Invoice``
        records through ``session.get_or_create``. Paging stops when a page
        has no rows, when an already stored invoice is met (the rest of that
        page is still processed), when no "next" link can be found, or when
        the "next" link points at the page just processed.

        Fixes over the previous revision:
          * the PDF check is applied to the document link, not to the
            listing page URL;
          * the "not PDF" warning no longer reads ``model`` before it is
            bound;
          * a page without a pagination link ends the run instead of raising
            AttributeError on ``None``.

        Raises:
            requests.exceptions.HTTPError: re-raised after being logged when
                a page responds with an error status.
        """
        with Database() as session:
            with Browser() as browser_session:
                url = self.url(Category.INVOICE.value)
                loop = True
                while loop:
                    try:
                        response = browser_session.get(url)
                        response.raise_for_status()
                        soup = bs4.BeautifulSoup(response.content,
                                                 'html.parser')

                        elements = soup.select(
                            '#dokumenty table.tabulka tr:not(.hlavicka)')

                        if len(elements) == 0:
                            LOGGER.info('Done, no more invoices')
                            return

                        for element in elements:
                            published, _, title, _, _, _, document = element.findChildren(
                                'td')
                            link = document.findChild('a').attrs.get('href')
                            size_in_mb = re.search(r'([0-9\.]+)',
                                                   document.text).groups()[0]
                            # The document link decides whether this row is a
                            # PDF; ``link`` may be None when href is missing.
                            if not re.search(r'\.pdf$', link or ''):
                                LOGGER.warning(
                                    f'Invoice {link} ...skipped (not PDF)')
                                continue

                            model, created = session.get_or_create(
                                Invoice,
                                published=datetime.date.fromisoformat(
                                    published.text),
                                title=title.text,
                                url=link,
                                size_in_mb=size_in_mb)
                            if not created:
                                LOGGER.info(
                                    f'Invoice {model.url} ...skipped (duplicate)'
                                )
                                # Known invoice reached: finish this page,
                                # then stop paging.
                                loop = False
                            else:
                                LOGGER.info(f'{model.url} ...added')

                        next_link = soup.select_one(
                            '#dokumenty table:first-of-type [align="right"] a:nth-last-child(2)'
                        )
                        if next_link is None:
                            # No pagination link on this page: nothing left.
                            LOGGER.info('Done, no more invoices')
                            return
                        next_url = urljoin(self.base_url(response.url),
                                           next_link.attrs.get('href'))

                        # FIXME: first page can be w/o the page number
                        if next_url == url:
                            return
                        url = next_url
                    except requests.exceptions.HTTPError as exc:
                        LOGGER.warning('Scraping {} {} ...skipping'.format(
                            url, exc.response.status_code))
                        raise exc
Exemplo n.º 5
0
 def showextradata(self, data, location):
     """Log extra data found at ``location`` at EXTRA_DATA level.

     Data longer than 128 items is wrapped in a ``FileObject`` and only the
     result of its ``type()`` call is logged; shorter data is logged as is.
     """
     if len(data) <= 128:
         message = '[0x%x] > %s' % (location, data)
     else:
         message = '[0x%x] %s' % (location, FileObject(data).type())
     LOGGER.log(CustomLoggingLevel.EXTRA_DATA, message)
Exemplo n.º 6
0
 def tag_dri(self, tag):
     """Handle the 0xFFDD Define Restart Interval segment.

     Stores the 16-bit restart interval in ``self.restartInterval``. When
     the declared payload is not exactly two bytes, the remaining bytes are
     read and logged at EXTRA_DATA level. Returns the result of
     ``find_tag('DRI')``.
     """
     payloadLength = self.read_uint16() - 2
     # Sample the offset before the interval value itself is consumed.
     position = '[0x%x]' % self.fileObject.cur()
     self.restartInterval = self.read_uint16()
     if payloadLength != 2:
         surplus = self.fileObject.read(payloadLength - 2)
         LOGGER.log(CustomLoggingLevel.EXTRA_DATA,
                    '%s> %s' % (position, surplus))
     return self.find_tag('DRI')
Exemplo n.º 7
0
 def clean_bitstream_remainder(self):
     """Drop the partially consumed first entry of ``self.streamBuffer``.

     The bits selected by ``myBitStreamMaskR[8 - self.bitStreamStart]`` are
     inspected first: anything other than all-zero or all-one under that
     mask is logged at EXTRA_DATA level. The entry is then removed and
     ``self.bitStreamStart`` is reset to 0.
     """
     mask = myBitStreamMaskR[8 - self.bitStreamStart]
     head = self.streamBuffer[0]
     remainder = head & mask
     if remainder not in (0, mask):
         LOGGER.log(
             CustomLoggingLevel.EXTRA_DATA,
             '?0x%x? Unsual end of bitstream, is %s. (0x%s)' %
             (self.scanDataIndex, bin(remainder), head))
     self.streamBuffer.remove(head)
     self.bitStreamStart = 0
Exemplo n.º 8
0
 def tag_app1(self, tag):
     """Handle an APP1 segment.

     When the six bytes after the length field are ``'Exif\\x00\\x00'`` the
     payload is passed to ``read_tiff`` and the cursor is then moved to the
     end of the segment. Otherwise a warning is logged and the rest of the
     segment is read and discarded. Returns the result of
     ``find_tag('APP1')`` in both cases.
     """
     segmentStart = self.fileObject.cur()
     length = self.read_uint16()
     magic = self.fileObject.read(6)
     if magic == 'Exif\x00\x00':
         self.read_tiff(length - 8, 'Exif')
         # Reposition explicitly instead of trusting how far read_tiff got.
         self.fileObject.change_cur(segmentStart + length)
     else:
         LOGGER.warning('[0x%x] Unbale to process magic %s in APP1.' %
                        (self.fileObject.cur(), magic))
         self.fileObject.read(length - 8)
     return self.find_tag('APP1')
Exemplo n.º 9
0
 def tag_sof(self, tag):
     """Handle the 0xFFC1~0xFFC7 / 0xFFC9~0xFFCF Start Of Frame segments.

     Records the encode type as ``'sofx'`` together with bits per pixel,
     height and width, logs an error when the following byte is not 0x03,
     skips nine more bytes and returns the result of ``find_tag('SOFx')``.
     """
     self.read_uint16()  # segment length; read but not used afterwards
     self.encodeType = 'sofx'
     self.bitsPerPixel = self.fileObject.read_uint8()
     self.height = self.read_uint16()
     self.width = self.read_uint16()
     componentCount = self.fileObject.read(1)
     if componentCount != '\x03':
         LOGGER.error('[0x%x] Color type must be YCrCb(0x03) in JFIF.' %
                      self.fileObject.cur())
     self.fileObject.read(9)  # component bytes are consumed, not stored
     return self.find_tag('SOFx')
Exemplo n.º 10
0
 def find_tag(self, tagName):
     """Read the next two-byte tag and return it as ``'\\xff' + <byte>``.

     If the next byte is not 0xFF an error naming ``tagName`` is logged and
     bytes are consumed until a 0xFF is met; the bytes consumed by that
     search are logged at EXTRA_DATA level. The first non-0xFF byte itself
     is not part of that log entry.
     """
     if self.fileObject.read(1) != '\xFF':
         position = '[0x%x]' % self.fileObject.cur()
         LOGGER.error('%s Can\'t find 0xFF in end of %s.' %
                      (position, tagName))
         skipped = []
         while True:
             byte = self.fileObject.read(1)
             if byte == '\xFF':
                 break
             skipped.append(byte)
         LOGGER.log(CustomLoggingLevel.EXTRA_DATA,
                    '%s> %s' % (position, ''.join(skipped)))
     return '\xff' + self.fileObject.read(1)
Exemplo n.º 11
0
 def tag_app0(self, tag):
     """Handle the 0xFFE0 APP0 segment.

     Warns when the five-byte magic is not ``'JFIF\\x00'``, then stores the
     version, the thumbnail dimensions and ``length - 16`` bytes of
     thumbnail data. Returns the result of ``find_tag('APP0')``.
     """
     segmentLength = self.read_uint16()
     signature = self.fileObject.read(5)
     if signature != 'JFIF\x00':
         LOGGER.warning('[0x%x] Unbale to process magic %s in APP0.' %
                        (self.fileObject.cur(), signature))
     self.version = self.read_uint16()
     reader = self.fileObject
     reader.read(5)  # five bytes are skipped without being interpreted
     self.thumbnailX = reader.read_uint8()
     self.thumbnailY = reader.read_uint8()
     # RGB pixel
     self.thumbnail = reader.read(segmentLength - 16)
     return self.find_tag('APP0')
Exemplo n.º 12
0
 def read_tiff(self, length, tagName):
     """Parse a TIFF header at the current position and walk its directories.

     A leading ``'II'`` selects the integer readers of ``self.fileObject``;
     any other two bytes select the readers defined on ``self``
     (presumably the two byte orders; confirm against both
     implementations). A warning is logged when the following 16-bit value
     is not 0x2a. The directory offset read next is handed to
     ``read_tiff_ifd``. ``length`` is accepted but not used.
     """
     tiffStartPos = self.fileObject.cur()
     usesFileObjectReaders = self.fileObject.read(2) == 'II'
     source = self.fileObject if usesFileObjectReaders else self
     p_read_uint16 = source.read_uint16
     p_read_uint32 = source.read_uint32

     magicOk = p_read_uint16() == 0x2a
     if not magicOk:
         LOGGER.warning('[0x%x] TIFF data format magic check failed.' %
                        tiffStartPos)
     dirEntryPos = p_read_uint32()
     self.read_tiff_ifd(tiffStartPos, p_read_uint16, p_read_uint32,
                        dirEntryPos, tagName)
Exemplo n.º 13
0
 def tag_app(self, tag):
     """Handle a generic 0xFFE1~0xFFEE application-specific segment.

     The segment payload is read and discarded; only its presence is logged
     at OTHER_DATA level. Returns the result of ``find_tag('APP<n>')``.

     Fix: the membership test was inverted. APP1, APP2, APP13 and APP14 are
     the segments commonly found in JPEG files, so the "usually not used"
     remark now goes to every other APPn number instead of to those four.
     """
     # 0xFFE1~0xFFEE Application-specific
     appID = (ord(tag[0]) << 8) + ord(tag[1]) - 0xFFE0
     length = self.read_uint16() - 2
     self.fileObject.read(length)  # payload is skipped, not interpreted
     position = self.fileObject.cur() - length
     if appID in (1, 2, 13, 14):
         LOGGER.log(
             CustomLoggingLevel.OTHER_DATA, '[0x%x] Tag APP%d found.' %
             (position, appID))
     else:
         LOGGER.log(
             CustomLoggingLevel.OTHER_DATA,
             '[0x%x] Tag APP%d found, this tag usually not used in file.' %
             (position, appID))
     return self.find_tag('APP%d' % appID)
Exemplo n.º 14
0
 def start(self):
     """Parse the file as JPEG, dispatching every tag through ``self.tagMap``.

     Logs an error and stops when the file does not begin with 0xFFD8.
     Otherwise tags are processed until ``self.scanFlag`` is no longer
     False or a handler returns None; a KeyError raised while looking up or
     running a handler sends the tag to ``tag_unknown``. A summary line is
     logged at IMAGE_INFO level afterwards.
     """
     if self.fileObject.read(2) != '\xff\xd8':  # not a JPEG start mark
         LOGGER.error('JPEG file start mark 0xFFD8 check failed.')
         return

     tag = self.fileObject.read(2)
     while self.scanFlag == False and tag != None:
         try:
             tag = self.tagMap[tag](tag)
         except KeyError:
             tag = self.tag_unknown(tag)

     summary = (
         'JPEG (ver %d.%d): %d*%dpx , channel: %d, fileLength: 0x%x b.'
         % (self.version >> 8, self.version & 0xff, self.width,
            self.height, self.channel, self.fileObject.size))
     LOGGER.log(CustomLoggingLevel.IMAGE_INFO, summary)
Exemplo n.º 15
0
    def read_tiff_ifd(self, tiffStartPos, p_read_uint16, p_read_uint32,
                      dirEntryPos, tagName):
        """Walk a chain of TIFF directories and log every entry.

        Args:
            tiffStartPos: file offset of the TIFF header; all directory and
                data offsets are taken relative to it.
            p_read_uint16: callable reading a 16-bit integer, optionally at
                a given position.
            p_read_uint32: callable reading a 32-bit integer, optionally at
                a given position.
            dirEntryPos: offset of the first directory; 0 ends the walk.
            tagName: label used in the log output.

        Entries tagged 0x8769 and 0xa005 are followed recursively as
        'SubExif' and 'ExifInteroperability' directories.

        Fix: the handler used ``except KeyError or IndexError``, which
        evaluates to ``except KeyError`` and let IndexError escape. Both are
        now caught, and the two values used by the warning are pre-set so
        the handler cannot hit an unbound name.
        """
        dirCount = 0
        while dirEntryPos != 0:
            entryCount = p_read_uint16(tiffStartPos + dirEntryPos)
            LOGGER.log(
                CustomLoggingLevel.IMAGE_DEBUG,
                '[%s] Tiff data start at 0x%x, directory index: %d, start at: 0x%x, entry count: %d.'
                % (tagName, tiffStartPos, dirCount, dirEntryPos, entryCount))
            for i in range(entryCount):
                # Defaults for the warning below in case a read fails early.
                dirTag = dataFormat = 0
                try:
                    dirTag = p_read_uint16(tiffStartPos + dirEntryPos + 2 +
                                           12 * i)
                    dataFormat = p_read_uint16()
                    nComponent = p_read_uint32()
                    dataLength = nComponent * tiffEnumDataTypeLength[dataFormat]
                    if dataLength > 4:
                        # Larger values are stored elsewhere; the entry holds
                        # their offset relative to the TIFF header.
                        dataStartPos = p_read_uint32()
                        data = self.fileObject.read(
                            dataLength, tiffStartPos + dataStartPos)
                    else:
                        data = self.fileObject.read(4)

                    if dirTag == 0x8769:
                        self.read_tiff_ifd(
                            tiffStartPos, p_read_uint16, p_read_uint32,
                            p_read_uint32(tiffStartPos + dirEntryPos + 10 +
                                          12 * i), 'SubExif')
                    elif dirTag == 0xa005:
                        self.read_tiff_ifd(
                            tiffStartPos, p_read_uint16, p_read_uint32,
                            p_read_uint32(tiffStartPos + dirEntryPos + 10 +
                                          12 * i), 'ExifInteroperability')

                    if dataFormat == 2:
                        LOGGER.log(
                            CustomLoggingLevel.IMAGE_INFO,
                            '[%s - %s](string)> %s' %
                            (tagName, exifEnumTag[dirTag],
                             data.replace('\x00', '')))
                    else:
                        LOGGER.log(
                            CustomLoggingLevel.IMAGE_INFO,
                            '[%s - %s](%s)> Hex:%s' %
                            (tagName, exifEnumTag[dirTag],
                             tiffEnumDataType[dataFormat], data.encode('hex')))
                except (KeyError, IndexError):
                    LOGGER.warning(
                        '[0x%x] Unable to decode dataformat or entrytag in tiff data, tagName: %s, dirTag: 0x%x, dataFormat: 0x%x, directory: %d/%d.'
                        % (self.fileObject.cur(), tagName, dirTag, dataFormat,
                           i, entryCount))
            dirCount += 1
            # The offset of the next directory follows the last entry.
            dirEntryPos = p_read_uint32(tiffStartPos + dirEntryPos + 2 +
                                        12 * entryCount)
Exemplo n.º 16
0
 def tag_sos(self, tag):
     """Handle the 0xFFDA Start Of Scan segment.

     Sets ``self.scanFlag``, logs an error when the byte after the length
     field is not 0x03, stores the upper/lower nibble of each of the next
     three bytes as the 'DC'/'AC' entries of ``self.scanQuantization``, and
     records ``scanSs``, ``scanSe``, ``scanAh`` and ``scanAl``. Three more
     bytes are consumed at the end. Nothing is returned.
     """
     self.scanFlag = True
     self.read_uint16()  # segment length; read but not used afterwards
     if self.fileObject.read(1) != '\x03':
         LOGGER.error('[0x%x] Color type must be YCrCb(0x03) in JFIF.' %
                      self.fileObject.cur())
     comp = self.fileObject.read(3)
     for i in range(3):
         selector = ord(comp[i])
         self.scanQuantization[i] = {
             'DC': selector >> 4,
             'AC': selector & 0xf,
         }
     self.scanSs = self.fileObject.read(1)
     self.scanSe = self.fileObject.read(1)
     # One byte carries both values: high nibble Ah, low nibble Al.
     approximation = ord(self.fileObject.read(1))
     self.scanAl = approximation & 0xf
     self.scanAh = approximation >> 4
     self.fileObject.read(3)
Exemplo n.º 17
0
 def tag_dht(self, tag):
     """Handle the 0xFFC4 Define Huffman Table(s) segment.

     Each table starts with one byte: a zero high nibble maps the low
     nibble directly to the table slot, a non-zero high nibble maps it to
     slot ``2 + low nibble``. Slots 0-3 are decoded through
     ``huffmantree_decode`` (its return value plus the id byte is deducted
     from the remaining length); any other slot is logged at EXTRA_DATA
     level and the rest of the segment is skipped. Returns the result of
     ``find_tag('DHT')``.

     Fix: ``2 + tableIDByte & 0xf`` parsed as ``(2 + tableIDByte) & 0xf``,
     so an out-of-range low nibble (14 or 15) wrapped around to slot 0 or 1
     and was decoded over a valid table instead of being rejected.
     """
     # 0xFFC4 Define Huffman Table(s)
     length = self.read_uint16() - 2
     while length > 0:
         tableIDByte = self.fileObject.read_uint8()
         if tableIDByte >> 4 == 0:
             tableID = tableIDByte & 0xf
         else:
             # Parentheses are required: "+" binds tighter than "&".
             tableID = 2 + (tableIDByte & 0xf)
         if tableID < 4:
             length -= self.huffmantree_decode(tableID) + 1
         else:
             LOGGER.log(
                 CustomLoggingLevel.EXTRA_DATA,
                 '[0x%x] Unknown part of huffman table' %
                 (self.fileObject.cur() - 1))
             self.fileObject.read(length)  # skip unknown part
             break
     return self.find_tag('DHT')
Exemplo n.º 18
0
 def tag_sof0(self, tag):
     """Handle the 0xFFC0 Start Of Frame segment.

     Records the encode type as ``'sof0'``, the byte following the length
     field as ``dctTransform``, a fixed 8 bits per pixel, and the image
     height and width. Logs an error when the next byte is not 0x03, then
     fills ``self.colorQuantization`` from three 3-byte groups: the first
     byte is the key, the second supplies 'Horz' (high nibble) and 'Vert'
     (low nibble), the third is 'TableID'. Returns the result of
     ``find_tag('SOF0')``.
     """
     self.read_uint16()  # segment length; read but not used afterwards
     self.encodeType = 'sof0'
     self.dctTransform = self.fileObject.read_uint8()
     self.bitsPerPixel = 8
     self.height = self.read_uint16()
     self.width = self.read_uint16()
     if self.fileObject.read(1) != '\x03':
         LOGGER.error('[0x%x] Color type must be YCrCb(0x03) in JFIF.' %
                      self.fileObject.cur())
     comp = self.fileObject.read(9)
     for i in range(3):
         base = 3 * i
         sampling = ord(comp[base + 1])
         entry = {
             'Horz': sampling >> 4,
             'Vert': sampling & 0xf,
             'TableID': ord(comp[base + 2]),
         }
         self.colorQuantization[ord(comp[base])] = entry
     return self.find_tag('SOF0')
Exemplo n.º 19
0
    def __init__(self, max_retries: int = 5):
        """Create the HTTP session used for scraping.

        Args:
            max_retries: total retry count given to the ``Retry`` strategy
                that is mounted for both ``https://`` and ``http://``. A
                falsy value leaves the session's adapters untouched.
        """
        LOGGER.info("Creating browser session")
        self.session = Session()

        LOGGER.info("Injecting headers into the browser")
        browser_headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:81.0) Gecko/20100101 Firefox/88.0",
            "Accept-Language":
            "sk,en-US;q=0.7,en;q=0.3",
        }
        self.session.headers.update(browser_headers)

        if max_retries:
            # One adapter instance serves both schemes.
            adapter = HTTPAdapter(max_retries=Retry(total=max_retries))
            for scheme in ("https://", "http://"):
                self.session.mount(scheme, adapter)

        super().__init__()
Exemplo n.º 20
0
def cli():
    """Run every scraper plugin once, in a fixed order, and return 0."""
    LOGGER.info('Start')

    plugin_classes = (
        MeetingPlugin,
        AnnouncementPlugin,
        NewsletterPlugin,
        BudgetPlugin,
        ProcurementPlugin,
        ReportPlugin,
        OrderPlugin,
        InvoicePlugin,
        ContractPlugin,
        TablePlugin,
        ResolutionPlugin,
        TranscriptPlugin,
        VZNPlugin,
    )
    # Each plugin is instantiated right before it runs, one after another.
    for plugin_class in plugin_classes:
        plugin_class().run()

    LOGGER.info('Done')
    return 0
Exemplo n.º 21
0
def asc_detect(filename, min_length=5):
    """Log every run of readable ASCII characters found in a file.

    The file is read byte by byte through ``FileObject``. Each run of at
    least ``min_length`` consecutive readable characters is logged at
    ASCII_DATA level together with the position at which the run ended.

    Changes over the previous revision:
      * a run that extends to the end of the file is now logged too (it
        used to be dropped because only a non-readable byte flushed it);
      * the readable set is built once instead of on every byte, and the
        run is collected in a list rather than by repeated concatenation;
      * the backslash in the readable set is written as a valid escape.

    Args:
        filename: passed unchanged to ``FileObject``.
        min_length: shortest run that is reported.
    """
    LOGGER.log(CustomLoggingLevel.OTHER_DATA, "--- ascii detect start --- ")

    readable_chars = frozenset(
        "abcdefghijklmnopqrstuvwxyz"
        "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
        "0123456789"
        "`~!@#$%^&*()_+[]\\{}|;':\",./<>"
        " \n\t\r")

    def is_readable(c):
        return c < 128 and chr(c) in readable_chars

    def report(position, chars):
        # Emit a finished run when it is long enough to be of interest.
        if len(chars) >= min_length:
            LOGGER.log(CustomLoggingLevel.ASCII_DATA,
                       "[ascii] at pos 0x%x:\n" % position + "".join(chars))

    # LOGGER.addHandler(logging.StreamHandler())
    file_object = FileObject(filename)
    run = []
    for i in xrange(file_object.size):
        byte = file_object.read_uint8()
        if is_readable(byte):
            run.append(chr(byte))
        else:
            report(i, run)
            run = []
    # Flush a run that is terminated by the end of the file.
    report(file_object.size, run)
    LOGGER.log(CustomLoggingLevel.OTHER_DATA, "--- ascii detect finished --- ")
Exemplo n.º 22
0
    def get_images(self):
        """Decode every entry of ``self.images`` into an ``Image`` object.

        The image's local colour table is used when its
        ``localColorTableFlag`` is 1, otherwise the global one. The LZW
        decoded indices are mapped through that table into ``data``. A
        mismatch between the decoded pixel count and ``width * height`` is
        logged at OTHER_DATA level; the image is returned regardless.

        Fix: the mismatch message used ``len(self.result)``, but the list
        being built is the local ``result``. The number reported is the
        1-based position of the image, since it has already been appended.

        Returns:
            list of ``Image`` objects in the order of ``self.images``.
        """
        result = []
        for image in self.images:
            color_table = self.globalColorTable
            if "localColorTableFlag" in image and image[
                    "localColorTableFlag"] == 1:
                color_table = image["localColorTable"]
            data = self.lzw_decode(image["data"], image["LZWMinimumCodeSize"])
            cur = Image()
            cur.w = image["width"]
            cur.h = image["height"]
            cur.data = [color_table[i] for i in data]
            result.append(cur)
            if len(cur.data) != cur.w * cur.h:
                LOGGER.log(
                    CustomLoggingLevel.OTHER_DATA,
                    "image %d has wrong width or height " % len(result))
        return result
Exemplo n.º 23
0
    def request(self):
        """Scrape the budget listing and store every link it contains.

        Downloads the budget category page, selects all anchors below
        ``#content-left`` and passes them, in reversed document order, to
        ``session.get_or_create``. A title without any digit gets the text
        of the preceding ``h2`` sibling appended. A failing HTTP status is
        logged as a warning and the ``HTTPError`` is re-raised.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.BUDGET.value)
                response = browser_session.get(url)
                response.raise_for_status()

                page = bs4.BeautifulSoup(response.content, 'html.parser')
                anchors = page.select('#content-left a')
                LOGGER.info('{} budgets have been found'.format(
                    len(anchors)))

                if not anchors:
                    LOGGER.info('Done, no more budgets')
                    return

                for anchor in reversed(anchors):
                    href = anchor['href']
                    title = anchor.text

                    has_digit = re.search(r'\d', title)
                    if not has_digit:
                        heading = anchor.findPreviousSibling('h2').text
                        title = f'{title} {heading}'

                    _, created = session.get_or_create(Budget,
                                                       url=href,
                                                       title=title)
                    if created:
                        LOGGER.info(f'{href} ...added')
                    else:
                        LOGGER.info(
                            f'Budget "{href}" ...skipped (duplicate)')
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemplo n.º 24
0
    def request(self):
        """Scrape the report listing and store every table row it contains.

        Downloads the report category page and, for every ``tr`` below
        ``#content-left``, unpacks its ``td`` children into a date cell and
        a title cell. Whitespace runs in the title are collapsed to single
        spaces before the pair is passed to ``session.get_or_create``. A
        failing HTTP status is logged as a warning and the ``HTTPError`` is
        re-raised.
        """
        try:
            with Database() as session, Browser() as browser_session:
                url = self.url(Category.REPORT.value)
                response = browser_session.get(url)
                response.raise_for_status()

                page = bs4.BeautifulSoup(response.content, 'html.parser')
                rows = page.select('#content-left tr')
                LOGGER.info('{} reports have been found'.format(
                    len(rows)))

                if not rows:
                    LOGGER.info('Done, no more reports')
                    return

                for row in rows:
                    date_cell, title_cell = row.findChildren('td')
                    flat_title = re.sub(r'[\n\s]+', ' ', title_cell.text)

                    model, created = session.get_or_create(
                        Report,
                        date=date_cell.text.strip(),
                        title=flat_title.strip())
                    if created:
                        LOGGER.info(f'"{model.title[:40]}..." ...added')
                    else:
                        LOGGER.info(
                            f'Report "{model.title[:30]}..." ...skipped (duplicate)'
                        )
        except requests.exceptions.HTTPError as exc:
            LOGGER.warning('Scraping {} {} ...skipping'.format(
                url, exc.response.status_code))
            raise exc
Exemplo n.º 25
0
# Path built by project_path('commands'); the CLI class below lists this
# directory for ``*.py`` files and loads its sub-commands from them.
plugin_folder = project_path('commands')


class CLI(click.MultiCommand):
    """Multi-command whose sub-commands are the ``*.py`` files in ``plugin_folder``.

    Every file is expected to define a module-level ``cli`` object, which is
    returned as the command.
    """

    def list_commands(self, ctx):
        """Return the sorted file stems of all ``*.py`` files in ``plugin_folder``."""
        return sorted(filename[:-3]
                      for filename in os.listdir(plugin_folder)
                      if filename.endswith('.py'))

    def get_command(self, ctx, name):
        """Load ``<plugin_folder>/<name>.py`` and return its ``cli`` object.

        Returns None when no such file exists, so that an unknown command
        name is reported by click instead of surfacing as a file error.

        Raises:
            KeyError: when the file exists but does not define ``cli``.
        """
        fn = os.path.join(plugin_folder, name + '.py')
        if not os.path.isfile(fn):
            return None

        with open(fn) as f:
            code = compile(f.read(), fn, 'exec')

        ns = {}
        # NOTE(review): this executes the file's contents with full
        # privileges; plugin_folder must only ever hold trusted code.
        eval(code, ns, ns)
        return ns['cli']


cli = CLI()

if __name__ == '__main__':
    try:
        LOGGER.info(' '.join(sys.argv))
        # standalone_mode=False makes click return the command's result and
        # raise its exceptions instead of handling them itself.
        sys.exit(cli(standalone_mode=False))
    except Exception as e:
        LOGGER.exception(e)
        # Report the failure to the caller; falling through here used to
        # end the process with exit status 0.
        sys.exit(1)
Exemplo n.º 26
0
 def after_request(self):
     """Log that the invoice run has finished."""
     message = 'Finished scrapping invoices'
     LOGGER.info(message)
Exemplo n.º 27
0
 def before_request(self):
     """Log that the invoice run is starting."""
     message = 'Start scrapping invoices'
     LOGGER.info(message)
Exemplo n.º 28
0
 def after_request(self):
     """Log that the newsletter run has finished."""
     message = 'Finished scrapping newsletter'
     LOGGER.info(message)
Exemplo n.º 29
0
 def before_request(self):
     """Log that the newsletter run is starting."""
     message = 'Start scrapping newsletter'
     LOGGER.info(message)
Exemplo n.º 30
0
 def _log_post_request(self, response):
     """Log the response's ``status`` and ``json`` attributes and return it unchanged."""
     status_line = f'Response status: {response.status}'
     LOGGER.info(status_line)
     json_line = f'Response json: {response.json}'
     LOGGER.info(json_line)
     return response