def file(self, lib_object: JSON, file: File) -> File:
        cutoff = self.utils.filename_length_cutoff
        if lib_object.get("fileName"):
            filename = lib_object.get("fileName")
        elif lib_object.get("name"):
            extension = mimetypes.guess_extension("application/pdf") or ""
            length = cutoff - len(extension)
            filename = slugify(lib_object.get("name"))[:length] + extension
        else:
            access_url = lib_object["accessUrl"]
            filename = slugify(access_url.split("/")[-1])[-cutoff:]

        file.name = lib_object.get("name", "")
        if len(file.name) > 200:
            file.name = textwrap.wrap(file.name, 199)[0] + "\u2026"

        file.filename = filename
        file.mime_type = lib_object.get("mimeType") or "application/octet-stream"
        file.legal_date = self.utils.parse_date(lib_object.get("date"))
        file.sort_date = (
            self.utils.date_to_datetime(file.legal_date)
            or self.utils.parse_datetime(lib_object.get("created"))
            or timezone.now()
        )
        file.oparl_access_url = lib_object.get("accessUrl")
        file.oparl_download_url = lib_object.get("downloadUrl")
        file.filesize = None
        file.parsed_text = lib_object.get("text")
        file.license = lib_object.get("fileLicense")

        # We currently do not handle locations attached to files due
        # to the lack of data and our own location extraction

        return file
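For illustration, here is how the name-based fallback above builds a filename, as a standalone sketch; the cutoff value of 80 and the sample document name are assumptions, not values taken from the project:

import mimetypes
from slugify import slugify

# Standalone sketch of the name-based fallback (a cutoff of 80 is assumed).
cutoff = 80
extension = mimetypes.guess_extension("application/pdf") or ""  # ".pdf"
length = cutoff - len(extension)
filename = slugify("Beschlussvorlage zur Ratssitzung")[:length] + extension
print(filename)  # beschlussvorlage-zur-ratssitzung.pdf (with python-slugify)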
Example #2
def build_party_group_geo():
    groups = json.load(open('group_parties.json', 'r'))
    G = nx.Graph()
    for i, group in enumerate(groups):
        parties = groups[group]
        party_list = []
        for party in parties:
            name = slugify.slugify(party['party'])
            party_list.append(name)
            lat = float(party['data']['lat'])
            lng = float(party['data']['lon'])
            lat, lng = scatter(lat, lng)

            G.add_node(name,
                       party_name=party['party'],
                       group_id=i,
                       group=group,
                       group_name=party['data']['group_name'],
                       city=party['data']['city'],
                       lat=lat,
                       lng=lng)

        for i in range(0, len(party_list)):
            for j in range(i + 1, len(party_list)):
                G.add_edge(party_list[i], party_list[j])

    print('Nodes:', len(G.nodes()))
    print('Edges:', len(G.edges()))

    nx.write_gexf(G, './sna/party_group_geo.gexf')
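The scatter helper used above is not shown in this example. A minimal sketch, assuming it only adds a small random jitter so that parties located in the same city do not overlap on the map:

import random

def scatter(lat, lng, radius=0.05):
    # Hypothetical jitter helper: nudge coordinates by a small random offset
    # so that co-located nodes get distinct positions.
    return (lat + random.uniform(-radius, radius),
            lng + random.uniform(-radius, radius))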
Example #3
def slugify(text, **options):
    # There is a conflict between python-slugify and awesome-slugify
    # So we import from a proper module manually

    # Import
    from slugify.slugify import slugify

    # Slugify
    slug = slugify(text, **options)
    return slug
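A quick usage sketch of the wrapper above; the exact slug depends on which slugify package wins the import and on the options passed through:

slug = slugify("Hello, World!")
print(slug)  # e.g. "Hello-World" with awesome-slugify, "hello-world" with python-slugify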
Example #4
    def process_item(self, item, spider):
        """Extracts and convret date to correspond to
        expected by items.py (Deck, VStats, CardsInDeck)"""

        deck_cleaned = {}

        deck_cleaned["deck_url"] = item["deck_url"]
        price = re.match(self.float_pat, item["price"])
        deck_cleaned["price"] = float(price.group()) if price else None
        metashare = re.match(self.float_pat, item["metashare"])
        deck_cleaned["metashare"] = (float(metashare.group()) /
                                     100 if metashare else None)
        global_performance = re.match(self.float_pat,
                                      item["global_performance"])
        deck_cleaned["global_performance"] = (
            float(global_performance.group()) /
            100 if global_performance else None)
        # extract date from era, break into two, and pop era
        era_begin, era_end = re.findall(self.short_date_pat, item["era"])
        deck_cleaned["era_begin"] = (datetime.strptime(era_begin, "%d %b %Y")
                                     if era_begin else None)
        deck_cleaned["era_end"] = (datetime.strptime(era_end, "%d %b %Y")
                                   if era_end else None)

        # Process each card in the decklist
        cleaned_cards = []
        for card in item["cards_in_deck"]:
            new_card = {}
            new_card["deck_url"] = item["deck_url"]
            new_card["main"] = bool(int(card.pop("data-main")))
            new_card["quantity"] = int(card.pop("data-qt"))
            new_card["card_name"] = card.pop("data-name")
            new_card["card_slug"] = slugify(new_card["card_name"],
                                            separator="_")
            cleaned_cards.append(new_card)
        deck_cleaned["cards_in_deck"] = cleaned_cards

        # Process each versus stat from the deck
        cleaned_vs = []
        for vs in item["vs_stats"]:
            new_vs = {}
            new_vs["deck_url"] = item["deck_url"]
            new_vs["vs_deck_url"] = vs.pop("vs_deck_url")
            matches = re.match(self.int_pat, vs.pop("matches"))
            new_vs["matches"] = int(matches.group()) if matches else 0
            new_vs["performance"] = float(
                re.match(self.float_pat, vs.pop("data-perf")).group())
            cleaned_vs.append(new_vs)
        deck_cleaned["vs_stats"] = cleaned_vs

        logger.debug("DEBUGGING item cleaned:")
        logger.debug(deck_cleaned)
        return deck_cleaned
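The float_pat, int_pat, and short_date_pat attributes used above are defined elsewhere on the pipeline class and are not shown here. A plausible sketch, shaped only to match how they are used in process_item (these exact patterns are assumptions):

import re

# Hypothetical pattern attributes for the pipeline class above.
float_pat = re.compile(r"\d+(?:\.\d+)?")             # leading float, e.g. "12.5%" -> "12.5"
int_pat = re.compile(r"\d+")                         # leading integer, e.g. "34 matches" -> "34"
short_date_pat = re.compile(r"\d{1,2} \w{3} \d{4}")  # e.g. "01 Jan 2020", parsed with "%d %b %Y"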
Example #5
    def file(self, libobject: OParl.File):
        file = self.check_existing(libobject, File, add_defaults=False)
        if not file:
            return
        logging.info("Processing File {}".format(libobject.get_id()))

        if libobject.get_file_name():
            displayed_filename = libobject.get_file_name()
        elif libobject.get_name():
            extension = mimetypes.guess_extension("application/pdf") or ""
            length = self.filename_length_cutoff - len(extension)
            displayed_filename = slugify(
                libobject.get_name())[:length] + extension
        else:
            displayed_filename = slugify(
                libobject.get_access_url())[-self.filename_length_cutoff:]

        file.oparl_id = libobject.get_id()
        file.name = libobject.get_name()[:200]  # FIXME
        file.displayed_filename = displayed_filename
        file.parsed_text = libobject.get_text()
        file.mime_type = libobject.get_mime_type(
        ) or "application/octet-stream"
        file.legal_date = self.glib_datetime_to_python_date(
            libobject.get_date())

        if self.download_files:
            self.download_file(file, libobject)
        else:
            file.storage_filename = ""
            file.filesize = -1

        if file.storage_filename and not file.parsed_text:
            self.extract_text_from_file(file)

        file.save()
        file.rebuild_locations()

        return file
Example #6
def build_party_group_geo():
    groups = json.load(open('group_parties.json', 'r'))
    G = nx.Graph()
    for i, group in enumerate(groups):
        parties = groups[group]
        party_list = []
        for party in parties:
            name = slugify.slugify(party['party'])
            party_list.append(name)
            lat = float(party['data']['lat'])
            lng = float(party['data']['lon'])
            lat, lng = scatter(lat, lng)

            G.add_node(name,
                       party_name=party['party'],
                       group_id=i,
                       group=group,
                       group_name=party['data']['group_name'],
                       city=party['data']['city'],
                       lat=lat,
                       lng=lng)

        for i in range(0, len(party_list)):
            for j in range(i + 1, len(party_list)):
                G.add_edge(party_list[i], party_list[j])

    print('Nodes:', len(G.nodes()))
    print('Edges:', len(G.edges()))

    nx.write_gexf(G, './sna/party_group_geo.gexf')
Example #7
    def file(self, libobject: OParl.File):
        file, do_update = self.check_for_modification(libobject, File)
        if not file or not do_update:
            return file
        self.logger.info("Processing File {}".format(libobject.get_id()))

        if libobject.get_file_name():
            displayed_filename = libobject.get_file_name()
        elif libobject.get_name():
            extension = mimetypes.guess_extension("application/pdf") or ""
            length = self.filename_length_cutoff - len(extension)
            displayed_filename = slugify(
                libobject.get_name())[:length] + extension
        else:
            displayed_filename = slugify(
                libobject.get_access_url())[-self.filename_length_cutoff:]

        parsed_text_before = file.parsed_text
        file_name_before = file.name

        file.oparl_id = libobject.get_id()
        file.name = libobject.get_name()
        file.displayed_filename = displayed_filename
        file.mime_type = libobject.get_mime_type(
        ) or "application/octet-stream"
        file.legal_date = self.glib_datetime_to_python_date(
            libobject.get_date())
        file.sort_date = file.created
        file.oparl_access_url = libobject.get_access_url()
        file.oparl_download_url = libobject.get_download_url()

        # If no text comes from the API, don't overwrite previously extracted PDF-content with an empty string
        if libobject.get_text():
            file.parsed_text = libobject.get_text()

        if self.download_files:
            self.download_file(file, libobject)
        else:
            file.storage_filename = ""
            file.filesize = -1

        parsed_text = file.parsed_text
        if file.storage_filename and not file.parsed_text:
            parsed_text = self.extract_text_from_file(file)

        file = self.call_custom_hook("sanitize_file", file)

        if len(file.name) > 200:
            file.name = textwrap.wrap(file.name, 199)[0] + "\u2026"

        file.save()

        if file_name_before != file.name or parsed_text_before != file.parsed_text:
            # These two operations are rather CPU-intensive, so we only perform them if something relevant has changed
            file.locations = extract_locations(parsed_text)
            file.mentioned_persons = extract_persons(file.name + "\n" +
                                                     (parsed_text or "") +
                                                     "\n")
            file.save()

        return file
Example #8
    def validate_variations(self, asset_type, variations):
        """
        Validate the given map of variations (if valid, the variations are
        returned).
        """

        # Check the structure of the variations is valid
        if not isinstance(variations, dict):
            raise APIError('invalid_request',
                           hint='Request body JSON must be an object.')

        if len(variations) == 0:
            raise APIError('invalid_request',
                           hint='At least one variation is required.')

        elif len(variations) > self.config['MAX_VARIATIONS_PER_REQUEST']:
            raise APIError(
                'invalid_request',
                hint=('The maximum number of variations that can be added in '
                      'a single request is '
                      f"{self.config['MAX_VARIATIONS_PER_REQUEST']}."))

        for name, transforms in variations.items():

            # Check the name of the variation is valid
            slug = slugify(
                name,
                regex_pattern=ALLOWED_SLUGIFY_CHARACTERS,
            )

            # Unlike slugify we allow dashes at the start/end of the variation
            # name, so we strip dashes before the test.
            if slug != name.strip('-'):
                raise APIError('invalid_request',
                               hint=f'Not a valid variation name: {name}.')

            # Check the required number of transforms have been provided
            if len(transforms) == 0:
                raise APIError(
                    'invalid_request',
                    hint=('At least one transform per variation is required: '
                          f'{name}.'))

            for i, transform in enumerate(transforms):

                # Check transform structure
                if not (len(transform) == 2 and isinstance(transform[0], str)
                        and isinstance(transform[1], dict)):
                    raise APIError(
                        'invalid_request',
                        hint=(f'Invalid transform structure: {transform} '
                              f'({name}).'))

                # Check the transform exists
                transform_cls = get_transform(asset_type, transform[0])
                if not transform_cls:
                    raise APIError(
                        'invalid_request',
                        hint=(
                            f'Unknown transform: {asset_type}:{transform[0]} '
                            f'({name}).'))

                # Check only the last transform in the list is flagged as a
                # final transform.
                if transform_cls.final and i < len(transforms) - 1:
                    raise APIError(
                        'invalid_request',
                        hint=('Final transform not set as last transform: '
                              f'{asset_type}:{transform[0]} ({name}).'))

                if not transform_cls.final and i == len(transforms) - 1:
                    raise APIError(
                        'invalid_request',
                        hint=(f'Last transform in list is not final: {name}'))

                # Check the settings for the transform are correct
                settings_form = transform_cls.get_settings_form_cls()(
                    MultiDict({
                        k: v
                        for k, v in transform[1].items() if v is not None
                    }))
                if not settings_form.validate():
                    raise APIError(
                        'invalid_request',
                        hint=('Invalid settings for transform: '
                              f'{asset_type}:{transform[0]} ({name}).'),
                        arg_errors=settings_form.errors)

        return variations
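For illustration, the payload this validator accepts is an object mapping each variation name to a list of [transform_name, settings] pairs, with the final transform last. The transform names and settings below are hypothetical; the real names come from whatever get_transform registers for the asset type:

variations = {
    "thumbnail": [
        ["fit", {"width": 300, "height": 300}],        # hypothetical non-final transform
        ["output", {"format": "jpg", "quality": 80}],  # hypothetical final transform
    ]
}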
Example #9
    async def put(self):
        """Store the uploaded file as an asset"""

        # Make sure a file was received
        files = self.request.files.get('file')
        if not files:
            raise APIError(
                'invalid_request',
                arg_errors={'file': ['No file received.']}
            )
        file = files[0]

        # Validate the arguments
        form = PutForm(to_multi_dict(self.request.body_arguments))
        if not form.validate():
            raise APIError(
                'invalid_request',
                arg_errors=form.errors
            )

        if self.config['ANTI_VIRUS_ENABLED']:

            # Check the file for viruses
            av_client = clamd.ClamdUnixSocket(
                self.config['ANTI_VIRUS_CLAMD_PATH']
            )
            av_scan_result = av_client.instream(io.BytesIO(file.body))

            if av_scan_result['stream'][0] == 'FOUND':
                raise APIError(
                    'invalid_request',
                    arg_errors={
                        'file': ['File appears to be a virus.']
                    }
                )

        form_data = form.data

        # Create a name for the asset
        fname, fext = os.path.splitext(file.filename)
        name = slugify(
            form_data['name'] or fname,
            regex_pattern=ALLOWED_SLUGIFY_CHARACTERS,
            max_length=200
        )

        # Determine the files extension
        ext = fext[1:] if fext else imghdr.what(file.filename, file.body)

        # Determine the asset type/content type for the image
        content_type = mimetypes.guess_type(f'f.{ext}')[0] \
                or 'application/octet-stream'
        asset_type = self.config['CONTENT_TYPE_TO_TYPES'].get(
            content_type,
            'file'
        )

        # Build the meta data for the asset
        meta = {
            'filename': file.filename,
            'length': len(file.body)
        }

        if asset_type == 'audio':
            try:
                au_file = io.BytesIO(file.body)
                au_file.name = file.filename
                au = mutagen.File(au_file)

            except:
                raise APIError(
                    'invalid_request',
                    arg_errors={
                        'file': ['Unable to open the file as an audio file.']
                    }
                )

            if au is not None:
                meta['audio'] = {
                    'channels': getattr(au.info, 'channels', -1),
                    'length': getattr(au.info, 'length', -1),
                    'mode': {
                        0: 'stereo',
                        1: 'joint_stereo',
                        2: 'dual_channel',
                        3: 'mono'
                    }.get(getattr(au.info, 'mode', ''), ''),
                    'sample_rate': getattr(au.info, 'sample_rate', -1)
                }

        if asset_type == 'image':

            im = None

            try:
                im = Image.open(io.BytesIO(file.body))
                meta['image'] = {
                    'mode': im.mode,
                    'size': im.size
                }

            except:
                raise APIError(
                    'invalid_request',
                    arg_errors={
                        'file': ['Unable to open the file as an image.']
                    }
                )

            finally:
                if im:
                    im.close()

        # Create the asset
        asset = Asset(
            uid=Asset.generate_uid(),
            account=self.account,
            secure=form_data['secure'],
            name=name,
            ext=ext,
            type=asset_type,
            content_type=content_type,
            expires=(time.time() + form_data['expire'])
                    if form_data['expire'] else None,
            meta=meta
        )

        # Store the file
        backend = self.get_backend(asset.secure)

        await backend.async_store(
            io.BytesIO(file.body),
            asset.store_key,
            loop=asyncio.get_event_loop()
        )

        # Save the asset
        asset.insert()

        # Update the asset stats
        Stats.inc(
            self.account,
            today_tz(tz=self.config['TIMEZONE']),
            {
                'assets': 1,
                'length': asset.meta['length']
            }
        )

        self.write(asset.to_json_type())
Example #10
    def on_changed_name(target, value, oldvalue, initiator):
        target.slug = slugify(value)
Example #11
    def on_changed_title(target, value, oldvalue, initiator):
        if value and (not target.slug or value != oldvalue):
            target.slug = slugify(value)
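A minimal sketch of how listeners like the two above are typically wired up, assuming a Flask-SQLAlchemy model; the Post model and its columns here are hypothetical:

from flask_sqlalchemy import SQLAlchemy
from slugify import slugify
from sqlalchemy import event

db = SQLAlchemy()

class Post(db.Model):
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(200))
    slug = db.Column(db.String(200), unique=True, index=True)

    @staticmethod
    def on_changed_title(target, value, oldvalue, initiator):
        if value and (not target.slug or value != oldvalue):
            target.slug = slugify(value)

# Re-slug the post whenever its title attribute is set.
event.listen(Post.title, 'set', Post.on_changed_title)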