Пример #1
0
    def handle(self, *args, **kwargs):
        """Split testimonials out of every club description.

        A description containing the marker "From members of <club>:" is
        truncated to the text before the marker; each quoted line after the
        marker is stored as a separate Testimonial row for that club.
        """
        # marker pattern: description before, testimonials after
        marker_pattern = re.compile(r"(.*)From members of ([^:]+)[:;](.*)",
                                    re.M | re.I | re.S)
        # a testimonial is text in double quotes followed by a newline,
        # or by the end of the string for the final quote
        quote_pattern = re.compile(r'"(.*?)"(?:\r?\n|$)', re.M | re.I | re.S)

        for club in Club.objects.all():
            if not club.description:
                continue
            found = marker_pattern.match(club.description)
            if found is None:
                continue
            desc, _, raw_testimonials = found.groups()

            # keep only the part before the marker as the description
            club.description = clean(desc)
            club.save()

            # strip any markup before splitting out individual quotes
            raw_testimonials = bleach.clean(raw_testimonials,
                                            strip=True,
                                            tags=[])

            extracted = 0
            for quoted in quote_pattern.findall(raw_testimonials):
                Testimonial.objects.create(club=club, text=quoted.strip())
                extracted += 1

            self.stdout.write(
                self.style.SUCCESS(
                    "Extracted {} testimonial(s) from {}".format(
                        extracted, club.code)))
Пример #2
0
 def fix_clubs(self):
     """Re-scrape every queued club page and refresh each club's description."""
     self.clubs_to_scrape = []
     self.process_url(self.START_URL)
     for club, page_url in self.clubs_to_scrape:
         scraped = self.extract_club_desc(page_url)
         if scraped is not None:
             club.description = clean(scraped.text.strip())
             # persist only when not doing a dry run
             if not self.dry_run:
                 club.save()
             self.stdout.write(f"Fixing club {club.name}.")
             self.club_count += 1
     self.stdout.write(f"Updated {self.club_count} clubs!")
Пример #3
0
    def add_ics_events(self):
        """
        Fetch the ICS events from the club's calendar URL and return the number of modified events.

        Feed entries are synced into the local Event table: an existing row is
        matched by ICS UUID first, then by exact start/end time, and updated
        in place; unmatched entries create new rows. Local ICS events that no
        longer appear in the feed are deleted at the end. Returns 0 when the
        club has no ``ics_import_url``.
        """
        # random but consistent uuid used to generate uuid5s from invalid uuids
        ics_import_uuid_namespace = uuid.UUID(
            "8f37c140-3775-42e8-91d4-fda7a2e44152")

        extractor = URLExtract()

        url = self.ics_import_url
        if url:
            calendar = Calendar(requests.get(url).text)
            # all locally stored ICS events for this club; pruned at the end
            event_list = Event.objects.filter(is_ics_event=True, club=self)
            modified_events = []
            for event in calendar.events:
                # candidate rows to update, checked in order; the trailing
                # fresh Event() guarantees the loop below always matches one
                tries = [
                    Event.objects.filter(club=self,
                                         start_time=event.begin.datetime,
                                         end_time=event.end.datetime).first(),
                    Event(),
                ]

                # try matching using uuid if it is valid
                if event.uid:
                    try:
                        event_uuid = uuid.UUID(event.uid[:36])
                    except ValueError:
                        # generate uuid from malformed/invalid uuids
                        event_uuid = uuid.uuid5(ics_import_uuid_namespace,
                                                event.uid)

                    # a uuid match takes priority over a time-window match
                    tries.insert(
                        0,
                        Event.objects.filter(ics_uuid=event_uuid).first())
                else:
                    event_uuid = None

                for ev in tries:
                    # skip None entries produced by .first() misses
                    if ev:
                        ev.club = self
                        # NOTE(review): assumes event.name and
                        # event.description are never None — TODO confirm the
                        # ics library guarantees this for this feed
                        ev.name = event.name.strip()
                        ev.start_time = event.begin.datetime
                        ev.end_time = event.end.datetime
                        ev.description = clean(event.description.strip())
                        ev.location = event.location
                        ev.is_ics_event = True

                        # very simple type detection, only perform on first time
                        # (pk is None only for a freshly constructed Event)
                        if ev.pk is None:
                            ev.type = Event.OTHER
                            for val, lbl in Event.TYPES:
                                # never auto-classify an event as a fair
                                if val in {Event.FAIR}:
                                    continue
                                if (lbl.lower() in ev.name.lower() or
                                        lbl.lower() in ev.description.lower()):
                                    ev.type = val
                                    break

                        # extract urls from description, preferring known
                        # video-conference domains via the sort key
                        if ev.description:
                            urls = extractor.find_urls(ev.description)
                            urls.sort(
                                key=lambda url: any(
                                    domain in url for domain in {
                                        "zoom.us",
                                        "bluejeans.com",
                                        "hangouts.google.com",
                                    }),
                                reverse=True,
                            )
                            if urls:
                                ev.url = urls[0]

                        # extract url from url or location
                        # (an explicit feed url overrides any description url)
                        if event.url:
                            ev.url = event.url
                        elif ev.location:
                            location_urls = extractor.find_urls(ev.location)
                            if location_urls:
                                ev.url = location_urls[0]

                        # format url properly with schema
                        if ev.url:
                            parsed = urlparse(ev.url)
                            if not parsed.netloc:
                                # bare "example.com/x" parses as path only;
                                # move it into the host component
                                parsed = parsed._replace(netloc=parsed.path,
                                                         path="")
                            if not parsed.scheme:
                                parsed = parsed._replace(scheme="https")
                            ev.url = parsed.geturl()

                        # add uuid if it exists, otherwise will be autogenerated
                        if event_uuid:
                            ev.ics_uuid = event_uuid

                        # ensure length limits are met before saving
                        if ev.location:
                            ev.location = ev.location[:255]
                        if ev.name:
                            ev.name = ev.name[:255]
                        if ev.code:
                            ev.code = ev.code[:255]
                        if ev.url:
                            ev.url = ev.url[:2048]

                        ev.save()
                        modified_events.append(ev)
                        # stop at the first candidate that was updated
                        break

            # prune local ICS events that no longer appear in the feed
            event_list.exclude(pk__in=[e.pk for e in modified_events]).delete()
            return len(modified_events)
        return 0
Пример #4
0
    def process_url(self, url):
        """
        Scrape one listing page of groups, creating or updating a Club for
        each entry, then recurse into the next page while a "Next >" link
        exists.

        Respects ``self.dry_run`` (no database writes) and ``self.skip_tags``
        (do not create Tag rows). Only blank club fields are overwritten.

        :param url: absolute URL of the listing page to scrape.
        :raises CommandError: if two existing clubs share the same name.
        """
        self.stdout.write("Processing Page {}".format(self.count))
        self.count += 1
        resp = self.session.get(url)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.content, "html.parser")
        grps = soup.select(".grpl .grpl-grp")
        for grp in grps:
            name = grp.select_one("h3 a").text.strip()
            # ignore the site-wide placeholder image
            image_url = urljoin(url, grp.select_one("img")["src"]).strip()
            if image_url.endswith("/group_img.png"):
                image_url = None
            group_tag = grp.select_one(".grpl-type")
            if group_tag is not None:
                group_type = group_tag.text.strip()
            else:
                group_type = None
            # the site uses a fixed sentence when no purpose was written
            description = grp.select_one(".grpl-purpose").text.replace(
                "\r\n", "\n").strip()
            if description == "This group has not written a purpose":
                description = ""
            else:
                description = clean(description)
            contact_tag = grp.select_one(".grpl-contact")
            if contact_tag is not None:
                contact_email = contact_tag.text.strip()
            else:
                contact_email = None

            if group_type is not None and not self.dry_run and not self.skip_tags:
                tag = Tag.objects.get_or_create(name=group_type)[0]
            else:
                tag = None
            clubs = Club.objects.filter(name__iexact=name)
            if clubs.exists():
                if clubs.count() > 1:
                    raise CommandError(
                        "Club with name '{}' exists twice!".format(name))
                club = clubs.first()
                flag = False
            else:
                code = slugify(name)
                if not self.dry_run:
                    club, flag = Club.objects.get_or_create(code=code)
                elif Club.objects.filter(code=code).exists():
                    club = Club.objects.get(code=code)
                    flag = False
                else:
                    # dry run: build an unsaved club so field updates below
                    # can still be reported
                    club = Club(code=code)
                    flag = True

            # only overwrite blank fields
            if not club.name:
                club.name = name
            if not club.description:
                club.description = description

            # only download the scraped image when the club has none, or when
            # its existing remote image link is broken
            use_image = False
            if image_url:
                if not self.dry_run:
                    if club.image:
                        # bug fix: probe the *existing* image URL (the old
                        # code probed the new image_url, inverting the
                        # replace-when-broken logic); local non-http images
                        # are assumed to exist
                        if club.image.url.startswith("http"):
                            resp = requests.head(club.image.url,
                                                 allow_redirects=True)
                            use_image = not resp.ok
                        else:
                            use_image = False
                    else:
                        use_image = True
                    if use_image:
                        resp = requests.get(image_url, allow_redirects=True)
                        resp.raise_for_status()
                        club.image.save(os.path.basename(image_url),
                                        ContentFile(resp.content))
            if not club.email:
                club.email = contact_email

            # mark newly created clubs as inactive (has no owner)
            if flag:
                club.active = False
            if not self.dry_run:
                club.save()
                # only attach the scraped tag when the club has no tags yet
                if tag is not None and not club.tags.count():
                    club.tags.set([tag])
            self.club_count += 1
            self.stdout.write("{} '{}' (image: {})".format(
                "Created" if flag else "Updated", name, use_image))

        # follow pagination until there is no "Next >" link
        next_tag = soup.find(text="Next >")
        if next_tag is not None:
            next_link = next_tag.find_parent("a")["href"]
            next_url = url.split("?", 1)[0] + next_link
            self.process_url(next_url)
Пример #5
0
    def process_url(self, url):
        """
        Scrape one listing page of groups, creating or updating a Club for
        each entry, then recurse into the next page while a "Next >" link
        exists.

        Respects ``self.dry_run`` (no database writes), ``self.create_only``
        (skip clubs that already exist) and ``self.skip_tags`` (do not create
        Tag rows). Only blank club fields are overwritten.
        """
        self.stdout.write(f"Processing Page {self.count}")
        self.count += 1
        resp = self.session.get(url)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.content, "html.parser")
        grps = soup.select(".grpl .grpl-grp")
        for grp in grps:
            # parse name
            name = grp.select_one("h3 a").text.strip()

            # parse image url; the site-wide placeholder image is ignored
            image_url = urljoin(url, grp.select_one("img")["src"]).strip()
            if image_url.endswith("/group_img.png"):
                image_url = None

            # parse tag
            group_tag = grp.select_one(".grpl-type")
            if group_tag is not None:
                group_type = group_tag.text.strip()
            else:
                group_type = None

            # parse description; the site uses a fixed sentence when no
            # purpose was written
            description = grp.select_one(".grpl-purpose").text.replace(
                "\r\n", "\n").strip()
            if description == "This group has not written a purpose":
                description = ""
            else:
                description = clean(description)

            # parse email contact
            contact_tag = grp.select_one(".grpl-contact")
            if contact_tag is not None:
                contact_email = contact_tag.text.strip()
            else:
                contact_email = None

            # create or update tag
            if group_type is not None and not self.dry_run and not self.skip_tags:
                tag = Tag.objects.get_or_create(name=group_type)[0]
            else:
                tag = None

            # don't include parentheses content in code
            slug_name = re.sub(r"\(.+?\)$", "", name).strip()

            # create or update club; a fuzzy match keeps the existing code
            code = slugify(slug_name)
            club = fuzzy_lookup_club(name)
            if club is not None:
                code = club.code
                flag = False
            else:
                club = Club(code=code)
                flag = True

            # in create-only mode, skip clubs that already exist
            if not flag and self.create_only:
                self.ignore_count += 1
                self.stdout.write(f"Ignoring {name}, club already exists")
                continue

            # only overwrite blank fields
            if not club.name:
                club.name = name
            if not club.description:
                club.description = description

            # only update image if existing image is nonexistent/broken link
            # if image is local and set, assume that it exists
            use_image = False
            if image_url:
                if not self.dry_run:
                    if club.image:
                        if club.image.url.startswith("http"):
                            resp = requests.head(club.image.url,
                                                 allow_redirects=True)
                            use_image = not resp.ok
                        else:
                            use_image = False
                    else:
                        use_image = True

                    if use_image:
                        resp = requests.get(image_url, allow_redirects=True)
                        resp.raise_for_status()
                        club.image.save(os.path.basename(image_url),
                                        ContentFile(resp.content))
                else:
                    # dry run: report what a real run would have done
                    use_image = not bool(club.image)

            # update email if there is no email
            if not club.email:
                club.email = contact_email

            # mark newly created clubs as inactive (has no owner)
            if flag:
                club.active = False

            if not self.dry_run:
                # save the club and its tag atomically
                with transaction.atomic():
                    club.save()
                    # only attach the scraped tag when the club has no tags
                    if tag is not None and not club.tags.count():
                        club.tags.set([tag])

            self.club_count += 1
            action_verb = "Created" if flag else "Updated"
            out_string = f"{action_verb} '{name}' (image: {use_image})"
            if flag:
                self.stdout.write(self.style.SUCCESS(out_string))
                self.create_count += 1
            else:
                self.stdout.write(out_string)
                self.update_count += 1

        # follow pagination until there is no "Next >" link
        next_tag = soup.find(text="Next >")
        if next_tag is not None:
            next_link = next_tag.find_parent("a")["href"]
            next_url = url.split("?", 1)[0] + next_link
            self.process_url(next_url)