Example #1
        def on_body_downloaded(body):
            # Decode the body according to the upstream Content-Encoding header
            if 'content-encoding' in headers:
                if headers['content-encoding'] == 'gzip':
                    # wbits=16+MAX_WBITS tells zlib to expect a gzip wrapper
                    body = zlib.decompress(body, 16+zlib.MAX_WBITS)
                elif headers['content-encoding'] == 'br':
                    try:
                        try:
                            import brotlicffi as brotli
                        except ImportError:
                            import brotli
                    except ImportError:
                        print('You must run pip install scrapfly-sdk[speedups] - brotli is missing - or disable brotli compression')
                        raise

                    body = brotli.decompress(body)

            # Rebuild a requests.Response around the decoded body so the
            # Scrapfly client can parse it like a normal HTTP response
            response = requests.Response()
            response.status_code = status_code
            response.reason = reason
            response._content = body

            response.headers.update(headers)
            response.url = request.url

            request.scrape_config.raise_on_upstream_error = False

            scrapfly_api_response: ScrapeApiResponse = spider.scrapfly_client._handle_response(
                response=response,
                scrape_config=request.scrape_config
            )

            self._crawler.stats.inc_value('scrapfly/bandwidth_consumed', count=scrapfly_api_response.context['bandwidth_consumed'])

            return ScrapflyScrapyResponse(request=request, scrape_api_response=scrapfly_api_response)
Example #2
def create_new_course(title: str, count: int, headers: dict) -> str:
    course_title = title + ' ' + str(count)
    url = 'https://iknow.jp/custom/courses'
    course = urllib.parse.quote_plus(course_title)
    payload = 'utf8=%E2%9C%93&goal%5Bname%5D={name}&language={lang}&translation_language={l}&goal%5Bicon_image_url%5D=&commit=Create'.format(name=course, lang='ja', l='en')
    try:
        res = requests.post(url, data=payload, headers=headers)
    except requests.RequestException:
        print('Failed to post new course ' + course_title)
        return ''
    res.encoding = 'utf-8'
    if res.status_code != requests.codes.ok:
        # Mark as a course we couldn't create - will process later
        print('Unable to make a new course!!')
        print('Provided title: ' + course_title)
        return ''
    try:
        res_decoded = brotlicffi.decompress(res.content)
    except brotlicffi.Error:
        print('Could not decompress our response from creating a course!')
        return ''
    # The response content is some jquery, which contains the course id
    response_content = res_decoded.decode('utf-8')
    match = re.search(r'/custom/courses/(\d*)', response_content)
    if not match:
        return ''
    else:
        course_id = str(match[1])
        return course_id
Example #3
def test_streaming_compression_flush(one_compressed_file,
                                     chunk_size,
                                     mode,
                                     quality,
                                     lgwin,
                                     lgblock):
    """
    Confirm that the streaming compressor works as expected, including flushes
    after each chunk.
    """
    compressed_chunks = []
    c = brotlicffi.Compressor(
        mode=mode, quality=quality, lgwin=lgwin, lgblock=lgblock
    )
    with open(one_compressed_file, 'rb') as f:
        while True:
            next_data = f.read(chunk_size)
            if not next_data:
                break

            compressed_chunks.append(c.compress(next_data))
            compressed_chunks.append(c.flush())

    compressed_chunks.append(c.finish())
    decompressed = brotlicffi.decompress(b''.join(compressed_chunks))
    with open(one_compressed_file, 'rb') as f:
        assert decompressed == f.read()
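For reference, here is a minimal sketch of the streaming decompression counterpart, assuming brotlicffi exposes the same Decompressor class as brotlipy (compressed chunks fed to decompress() one at a time). It is illustrative only and not part of the test suite above.

import brotlicffi

def stream_decompress(chunks):
    # Feed compressed chunks into the streaming decompressor and join the output
    d = brotlicffi.Decompressor()
    return b''.join(d.decompress(chunk) for chunk in chunks)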
Example #4
def test_roundtrip_compression_with_files(simple_compressed_file):
    """
    Roundtripping data through the compressor works correctly.
    """
    with open(simple_compressed_file[0], 'rb') as f:
        uncompressed_data = f.read()

    assert brotlicffi.decompress(
        brotlicffi.compress(uncompressed_data)) == uncompressed_data
Example #5
def test_decompression(simple_compressed_file):
    """
    Decompressing files returns their original form using decompress.
    """
    with open(simple_compressed_file[0], 'rb') as f:
        uncompressed_data = f.read()

    with open(simple_compressed_file[1], 'rb') as f:
        compressed_data = f.read()

    assert brotlicffi.decompress(compressed_data) == uncompressed_data
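Examples #4 and #5 both rely on a simple_compressed_file fixture that yields a pair of paths: the uncompressed file at index 0 and its compressed counterpart at index 1. The fixture itself is not shown here; the sketch below is a purely hypothetical stand-in built on pytest's tmp_path, not the real test data.

import pytest
import brotlicffi

@pytest.fixture
def simple_compressed_file(tmp_path):
    # Hypothetical stand-in: write a small sample and its brotli-compressed twin,
    # then return (uncompressed_path, compressed_path) as the tests expect
    raw = tmp_path / 'sample.txt'
    raw.write_bytes(b'hello brotli ' * 100)
    comp = tmp_path / 'sample.txt.compressed'
    comp.write_bytes(brotlicffi.compress(raw.read_bytes()))
    return (str(raw), str(comp))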
Example #6
	def __init__(self, file, checkChecksums=0, fontNumber=-1):
		if not haveBrotli:
			log.error(
				'The WOFF2 decoder requires the Brotli Python extension, available at: '
				'https://github.com/google/brotli')
			raise ImportError("No module named brotli")

		self.file = file

		signature = Tag(self.file.read(4))
		if signature != b"wOF2":
			raise TTLibError("Not a WOFF2 font (bad signature)")

		self.file.seek(0)
		self.DirectoryEntry = WOFF2DirectoryEntry
		data = self.file.read(woff2DirectorySize)
		if len(data) != woff2DirectorySize:
			raise TTLibError('Not a WOFF2 font (not enough data)')
		sstruct.unpack(woff2DirectoryFormat, data, self)

		self.tables = OrderedDict()
		offset = 0
		for i in range(self.numTables):
			entry = self.DirectoryEntry()
			entry.fromFile(self.file)
			tag = Tag(entry.tag)
			self.tables[tag] = entry
			entry.offset = offset
			offset += entry.length

		totalUncompressedSize = offset
		compressedData = self.file.read(self.totalCompressedSize)
		decompressedData = brotli.decompress(compressedData)
		if len(decompressedData) != totalUncompressedSize:
			raise TTLibError(
				'unexpected size for decompressed font data: expected %d, found %d'
				% (totalUncompressedSize, len(decompressedData)))
		self.transformBuffer = BytesIO(decompressedData)

		self.file.seek(0, 2)
		if self.length != self.file.tell():
			raise TTLibError("reported 'length' doesn't match the actual file size")

		self.flavorData = WOFF2FlavorData(self)

		# make empty TTFont to store data while reconstructing tables
		self.ttFont = TTFont(recalcBBoxes=False, recalcTimestamp=False)
Example #7
def create_new_item(course: str, course_id: str, word: dict, headers: dict) -> str:
    add_new_item_url = 'https://iknow.jp/custom/courses/{course_id}/items'.format(course_id=course_id)
    if word['word'] in previously_added or word['word'] in added:
        return ''
    if word['definition'] == BAD_DEF or word['reading'] == BAD_READING:
        # The kindle json couldn't figure these out, let's not add them and move on.
        print('Either bad reading or def for: ' + word['word'])
        fail_to_add_dict = {
            'course': course,
            'course_id': course_id,
            'word': word['word']
        }
        failed_to_add.append(fail_to_add_dict)
        return ''
    cur_word = urllib.parse.quote_plus(word['word'], encoding='utf-8')
    # Don't try to add words we've added in the past
    reading = urllib.parse.quote_plus(word['reading'], encoding='utf-8')
    definition = urllib.parse.quote_plus(word['definition'], encoding='utf-8')
    pos_list = word['part_of_speech'].split(',')
    pos = 'NONE'  # Default to none
    # TODO: This chunk doesn't seem to work - part of speech wasn't added for any of my uploads
    # Use a separate loop variable so a list with no valid match doesn't clobber the 'NONE' default
    for candidate in pos_list:
        if candidate.lower() in valid_parts_of_speech:
            pos = pos_map.get(candidate, 'NONE')
            # Quit on first match
            break
    # implied else is either no PoS given, or can't map to anything. Keep as NONE
    '''
    This is the form iKnow sends:

    item[cue][text]=減点
    item[cue][language]=ja
    item[cue][transliteration]=げんてん
    item[cue][part_of_speech]=N
    item[response][text]=subtracting points
    item[response][language]=en
    '''
    cueString = 'item%5Bcue%5D%5Btext%5D={encodedCue}&item%5Bcue%5D%5Blanguage%5D={cueLang}&item%5Bcue%5D%5Btransliteration%5D={encodedCueTransliteration}&item%5Bcue%5D%5Bpart_of_speech%5D={cuePoS}'.format(encodedCue=cur_word, cueLang='ja', encodedCueTransliteration=reading, cuePoS=pos)
    responseString = '&item%5Bresponse%5D%5Btext%5D={responseText}&item%5Bresponse%5D%5Blanguage%5D={responseLang}'.format(responseText=definition, responseLang='en')
    payload = cueString + responseString
    '''
    Example payload:
item%5Bcue%5D%5Btext%5D=鼻歌&item%5Bcue%5D%5Blanguage%5D=jp&item%5Bcue%5D%5Btransliteration%5D=はなうた&item%5Bcue%5D%5Bpart_of_speech%5D=&item%5Bresponse%5D%5Btext%5D=humming, crooning&item%5Bresponse%5D%5Blanguage%5D=en
    '''
    try:
        res = requests.post(add_new_item_url, data=payload, headers=headers)
    except requests.RequestException:
        fail_to_add_dict = {
            'course': course,
            'course_id': course_id,
            'word': word['word']
        }
        failed_to_add.append(fail_to_add_dict)
        print('Failed to post new word ' + word['word'])
        return ''
    # Handle a weird bug I encountered where res came back as None - maybe just due to a forced exit
    if not res:
        fail_to_add_dict = {
            'course': course,
            'course_id': course_id,
            'word': word['word']
        }
        failed_to_add.append(fail_to_add_dict)
        print('Failed to post new word ' + word['word'] + ' - no response')
        return ''
    res.encoding = 'utf-8'
    if res.status_code != requests.codes.ok:
        # Mark as a word we couldn't add and bail out - a failed response won't contain an item id
        fail_to_add_dict = {
            'course': course,
            'course_id': course_id,
            'word': word['word']
        }
        failed_to_add.append(fail_to_add_dict)
        return ''
    else:
        added.add(word['word'])
    try:
        res_decoded = brotlicffi.decompress(res.content)
    except brotlicffi.Error as e:
        print(str(e))
        print('Could not decompress for word: ' + word['word'] + '\'s response')
        print(str(res.content))
        # Don't treat this as a failure to add. Just ensure that we don't try to add a sample sentence
        # and return a blank string
        return ''
    json_res = json.loads(res_decoded)
    # Grab the ID for the new flashcard we just added
    word_id = json_res['id']
    return word_id
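As a side note on the form layout documented in the comments above: the same payload could be built with urllib.parse.urlencode instead of hand-assembled percent-encoded strings. A minimal sketch, assuming the field names shown in the comment block; it is not taken from the original code.

import urllib.parse

def build_item_payload(word, pos):
    # Field names follow the item[cue]/item[response] form documented above
    fields = {
        'item[cue][text]': word['word'],
        'item[cue][language]': 'ja',
        'item[cue][transliteration]': word['reading'],
        'item[cue][part_of_speech]': pos,
        'item[response][text]': word['definition'],
        'item[response][language]': 'en',
    }
    # urlencode quotes with quote_plus by default, matching the manual encoding above
    return urllib.parse.urlencode(fields)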
Example #8
	def _decompress(self, rawData):
		return brotli.decompress(rawData)
Example #9
def test_compressed_data_roundtrips(s):
    assert brotlicffi.decompress(brotlicffi.compress(s)) == s
Example #10
def test_decompression_fails_properly_on_garbage(bogus, exception_cls):
    """
    Garbage data properly fails decompression.
    """
    with pytest.raises(exception_cls):
        brotlicffi.decompress(bogus)
Example #11
def unbrotli(data):
    '''Decompresses data for Content-Encoding: br.'''
    return brotli.decompress(data)
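A quick usage sketch for this helper (not part of the original example): round-tripping through the brotli module shows unbrotli() reversing brotli.compress().

import brotli

payload = b'response body sent with Content-Encoding: br'
assert unbrotli(brotli.compress(payload)) == payload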