def studio():
    """Renders the studio admin page (GET) or kicks off a studio scrape (POST)."""
    if request.method == "GET":
        common.connect_db()
        return render_template(
            "studio.html",
            schemas=list(schema.Challenge.objects().order_by("-modified")))  # yapf: disable
    else:
        scraper = Scraper()
        sid = scraper.get_id(request.form["studio"])
        s = None
        if request.form["schema"] != "__none__":
            s = request.form["schema"]

        if request.form["studio"] == "__all__":
            scrape.rescrape_all.delay(cache_directory=CACHE_DIRECTORY)
            return "Started"
        elif sid is not None:
            scrape.add_studio.delay(sid,
                                    schema=s,
                                    show=("show" in request.form),
                                    cache_directory=CACHE_DIRECTORY)
            return redirect("/studio/{0}".format(sid))
        else:
            return render_template(
                "studio.html",
                message="Please enter a valid studio ID or URL.")

def test_studio():
    """Returns the project IDs in a hard-coded test studio, plus a mapping
    of each project back to that studio's ID."""
    scrape = Scraper()
    projects = scrape.get_projects_in_studio(26211962)
    p_to_s = {project: 26211962 for project in projects}
    return projects, p_to_s

def project_download():
    """Downloads a single project, returning "False" on any invalid input."""
    # Use .get() so a missing field yields None instead of raising KeyError
    if request.form.get("sid") is None or request.form.get("pid") is None:
        return "False"
    sid = request.form["sid"]
    pid = request.form["pid"]
    scraper = Scraper()
    try:
        pid = int(pid)
    except (TypeError, ValueError):
        return "False"
    if pid in scraper.get_projects_in_studio(sid):
        return str(scrape.add_project(pid, sid, CACHE_DIRECTORY))
    else:
        return "False"

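# Sketch of the POST this endpoint expects — the route path and host are
# assumptions for illustration, not taken from this module:
#
#     curl -X POST -d "sid=26211962" -d "pid=123456789" \
#         http://localhost:5000/project/download
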
def generate_summary_page(credentials_file=settings.DEFAULT_CREDENTIALS_FILE):
    """Performs all the aggregation required to generate the summary page.

    Args:
        credentials_file (str): path to the database credentials file.

    Returns:
        True, once the summary JSON has been written to the cache directory.
    """
    logging.info("starting to aggregate summary statistics")

    # Stitch the project images together
    img = get_stitched(get_image_urls(), 16, w=96, h=72)
    img.save("{}/data/projects.jpg".format(settings.CACHE_DIRECTORY),
             dpi=(72, 72),
             quality=75)  # yapf: disable
    logging.info("project image stitch saved, starting on data gathering")

    # Get the data
    now = datetime.now()
    studios = get_ordered_studios()
    studio_ids = [s["studio_id"] for s in studios]
    engagement = get_total_engagement(studio_ids)
    # Fetch once; used for both the nations map and the unique-author count
    unique_authors = get_unique_authors(studio_ids)
    data = {
        "project_counts":
            [s["stats"]["total"]["number_projects"] for s in studios],
        "nations": get_author_origins(unique_authors),
        "totals": {
            "block_count":
                sum(s["stats"]["total"]["block_count"] for s in studios),
            "categories": get_total_categories(studios),
            "comments":
                sum(s["stats"]["total"]["comments_left"] for s in studios),
            "description":
                sum(s["stats"]["total"]["description_words"] for s in studios),
            "hearts_stars": engagement["loves"] + engagement["favorites"],
            "projects":
                sum(s["stats"]["total"]["number_projects"] for s in studios),
            "unique_authors": len(unique_authors)
        },
        "updated": now.strftime("%A, %B %d, %Y")
    }
    with open("{}/lib/data/summary.json".format(settings.PROJECT_DIRECTORY)) as f:  # yapf: disable
        static = json.load(f)
    data["static"] = static["statistics"]
    if Scraper().make_dir("{}/data".format(settings.CACHE_DIRECTORY)):
        with open("{}/data/summary.json".format(settings.CACHE_DIRECTORY), "w") as f:  # yapf: disable
            json.dump(data, f)
    logging.info("completed aggregating summary statistics")
    return True

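# The aggregation is self-contained, so a manual refresh is just the call
# below (assuming the default credentials file is in place):
#
#     generate_summary_page()
#     # => writes {CACHE_DIRECTORY}/data/projects.jpg and data/summary.json
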
def get_author_origins(authors):
    """Gets the origin locations of project authors.

    Args:
        authors (array-like): a set of authors for whom origin locations
            are to be counted.

    Returns:
        A dictionary mapping each country to the number of authors from it.
    """
    nations = dict()
    scraper = Scraper()
    for author in authors:
        user = scraper.get_user_info(author)
        country = user["profile"]["country"]
        nations[country] = nations.get(country, 0) + 1
    return nations

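# Shape of the result, as a sketch — the usernames and counts below are
# invented for illustration:
#
#     get_author_origins({"user_a", "user_b", "user_c"})
#     # => {"United States": 2, "Brazil": 1}
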
def add_comments(project_id,
                 username,
                 credentials_file=settings.DEFAULT_CREDENTIALS_FILE):
    """Inserts a project's comments into the database.

    These are public comments on the project itself, not code comments.

    Args:
        project_id (int): the ID of the project whose comments we're scraping.
        username (str): the username of the user who created the project.
        credentials_file (str): path to the database credentials file.

    Returns:
        None.
    """
    # DB connection
    connect_db(credentials_file=credentials_file)

    # Scrape comments
    scraper = Scraper()
    comments = scraper.get_project_comments(project_id)
    for comment in comments:
        preexisting = Comment.objects(project_id=project_id,
                                      comment_id=comment["id"]).first()
        if not preexisting:
            timestamp = datetime.strptime(comment["timestamp"],
                                          "%Y-%m-%dT%H:%M:%SZ")
            doc = Comment(comment_id=comment["id"],
                          project_id=project_id,
                          date=timestamp,
                          author=comment["username"].lower(),
                          recipient=username.lower(),
                          content=comment["comment"])
            doc.save()
    logging.debug("successfully scraped comments for project {}".format(project_id))  # yapf: disable

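# add_comments() is normally invoked from add_project() further down; a
# standalone call looks like this (the ID and username are placeholders):
#
#     add_comments(123456789, "project_author")
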
def generate_certs(usernames,
                   credentials_file=settings.DEFAULT_CREDENTIALS_FILE,
                   cache_directory=settings.CACHE_DIRECTORY):
    """Initiates the generation of all Getting Unstuck certificates.

    Args:
        usernames (array-like): list of usernames to create and scrape
            certificates for.
        credentials_file (str): path to the database credentials file.
        cache_directory (str): if set, will save this certificate into the
            cache directory specified.

    Returns:
        None.
    """
    Scraper().make_dir(f"{cache_directory}/certificates")
    logging.info("attempting to generate certificates")
    connect_db(credentials_file=credentials_file)

    # Get schema IDs, and add to a reusable query that will get all the
    # projects that have one of the schemas
    schema_ids = scrape.Studio.objects(
        public_show=True).values_list("challenge_id")
    query = []
    for schema_id in schema_ids:
        query.append({f"validation.{schema_id}": {"$exists": True}})
    projects = scrape.Project.objects(__raw__={"$or": query})

    # Loop through each username to generate certificate
    for username in usernames:
        # Get number of projects completed, capped at 10
        author_count = projects.filter(author=username).count()
        if author_count > 10:
            logging.info("certificate for {} has more than 10 projects! "
                         "reset to 10".format(username))
            author_count = 10

        # Generate certificate
        cert_download = convert_cert("pdf.html", username, author_count,
                                     cache_directory)
        if not cert_download:
            logging.info("certificate download failed for {}".format(username))
    logging.info("certificate generation completed!")

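# A small batch run, as a sketch — the usernames are placeholders:
#
#     generate_certs(["scratcher_one", "scratcher_two"])
#     # => certificates end up under {cache_directory}/certificates
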
def main():
    """Downloads the requested projects based on the parsed CLI arguments."""
    scrape = Scraper()
    arguments = get_arguments()
    projects, projects_to_studio = get_project_ids(scrape, arguments)
    if arguments.output_directory is None:
        scrape.download_projects(projects,
                                 projects_to_studio,
                                 file_name=arguments.output_name)
    else:
        scrape.download_projects(projects,
                                 projects_to_studio,
                                 output_directory=arguments.output_directory,
                                 file_name=arguments.output_name)

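# If this module is run directly as a script (an assumption, given main()
# and get_arguments() above), the usual entry-point guard would apply:
#
#     if __name__ == "__main__":
#         main()
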
def scraper():
    return Scraper()

def add_studio(studio_id,
               schema=None,
               show=False,
               cache_directory=None,
               credentials_file=settings.DEFAULT_CREDENTIALS_FILE):
    """Scrapes a studio and inserts it into the database.

    Args:
        studio_id (int): the ID of the studio to scrape.
        schema (str): the object ID of the schema associated with this studio.
        show (bool): whether to show the studio on the public Challenges page.
        cache_directory (str): if set, will save this project JSON into the
            cache directory specified.
        credentials_file (str): path to the database credentials file.

    Returns:
        None.

    Raises:
        IOError: if couldn't write the JSON file to the given cache_directory.
    """
    # Load scraper class
    scraper = Scraper()

    # Add individual studio to DB
    studio_info = scraper.get_studio_meta(studio_id)
    if studio_info is not None:
        logging.info("attempting to scrape studio {}".format(studio_id))
        connect_db(credentials_file=credentials_file)
        preexisting = Studio.objects(studio_id=studio_id).first()
        if preexisting:
            # Update a few fields
            doc = preexisting
            doc.title = studio_info["title"]
            doc.description = studio_info["description"]
            doc.status = "in_progress"
            if show is not None:
                doc.public_show = show
        else:
            # New studio altogether
            doc = Studio(studio_id=studio_id,
                         title=studio_info["title"],
                         description=studio_info["description"],
                         status="in_progress",
                         public_show=show)
        if schema is not None:
            doc.challenge_id = schema
        doc.save()

        # Add all the projects
        project_ids = scraper.get_projects_in_studio(studio_id)

        # Delete projects no longer in studio
        delete = Project.objects(studio_id=studio_id,
                                 project_id__nin=project_ids)
        logging.info("deleting {} projects no longer in studio {}".format(
            delete.count(), studio_id))
        delete.delete()

        # Add to studio
        for i, project in enumerate(project_ids):
            add_project(project,
                        studio_id=studio_id,
                        cache_directory=cache_directory,
                        credentials_file=credentials_file)
            if i % 10 == 0:
                logging.info("completed {}/{} projects in studio {}".format(
                    i, len(project_ids), studio_id))
        stats = get_studio_stats(studio_id, credentials_file=credentials_file)
        preexisting = Studio.objects(studio_id=studio_id).first()
        if preexisting is not None:
            preexisting.status = "complete"
            preexisting.stats = stats
            preexisting.save()
        logging.info("successfully scraped studio {}".format(studio_id))

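# In production this runs as a Celery task — the studio() route above calls
# add_studio.delay(...). A direct, synchronous call is the same shape (the
# studio ID below reuses the test studio from test_studio()):
#
#     add_studio(26211962, show=True,
#                cache_directory=settings.CACHE_DIRECTORY)
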
def add_project(project_id,
                studio_id=0,
                cache_directory=None,
                credentials_file=settings.DEFAULT_CREDENTIALS_FILE):
    """Inserts a project into the database after scraping it.

    Updates existing database entries.

    Args:
        project_id (int): the ID of the project to scrape.
        studio_id (int): the studio ID with which this project should be
            associated.
        cache_directory (str): if set, will save this project JSON into the
            cache directory specified.
        credentials_file (str): path to the database credentials file.

    Returns:
        True, if a new insertion or if updated a record. False if Scratch 2.

    Raises:
        IOError: if couldn't write the JSON file to the given cache_directory.
    """
    # Gather information about the project
    scraper = Scraper()
    metadata = scraper.get_project_meta(project_id)

    # Handle error from trying to decode ZIPs
    try:
        scratch_data = scraper.download_project(project_id)
    except RuntimeError:
        scratch_data = dict()

    # Convert to SB3 if possible
    parser = Parser()
    if not parser.is_scratch3(scratch_data) and settings.CONVERT_URL != "":
        try:
            r = requests.post(settings.CONVERT_URL, json=scratch_data)
            scratch_data = json.loads(r.json())
        except Exception:
            pass

    # Save to cache if needed
    if cache_directory is not None:
        if scraper.make_dir(f"{cache_directory}/projects"):
            name = "{0}/projects/{1}.json".format(cache_directory, project_id)  # yapf: disable
            with open(name, "w") as f:
                try:
                    json.dump(scratch_data, f)
                except Exception:
                    raise IOError("Couldn't write the JSON file to directory {0}".format(cache_directory))  # yapf: disable

    # Parse the project using the parser class
    try:
        if parser.is_scratch3(scratch_data):
            stats = parser.blockify(scratch_data=scratch_data)
            # Identity check: blockify() signals failure with a literal False
            if stats["blocks"] is False or stats["categories"] is False:
                stats = False
        else:
            stats = False
    except Exception:
        stats = False
    if not stats:
        logging.warning("Couldn't get statistics for project {}".format(project_id))  # yapf: disable
        return False

    # Change block_text's form
    text_new = {"text": [], "blocks": []}
    for text in stats["block_text"]:
        text_new["text"].append(text)
        text_new["blocks"].append(stats["block_text"][text])
    stats["block_text"] = text_new

    # Check database for existing project with project_id
    connect_db(credentials_file=credentials_file)
    preexisting = Project.objects(project_id=project_id).first()
    if preexisting:
        # Update a few fields
        doc = preexisting
        doc.title = metadata["title"]
        doc.description = metadata["description"]
        doc.instructions = metadata["instructions"]
        doc.author = metadata["author"]["username"].lower()
        doc.image = metadata["image"]
        doc.history = metadata["history"]
        doc.remix = metadata["remix"]
        doc.stats = stats
        doc.engagement = metadata["stats"]
        if studio_id > 0:
            doc.studio_id = studio_id
        if cache_directory is not None:
            doc.cache_expires = datetime.now() + timedelta(days=30)
    else:
        # Create a new record
        doc = Project(project_id=project_id,
                      title=metadata["title"],
                      description=metadata["description"],
                      instructions=metadata["instructions"],
                      author=metadata["author"]["username"].lower(),
                      image=metadata["image"],
                      history=metadata["history"],
                      remix=metadata["remix"],
                      engagement=metadata["stats"],
                      studio_id=studio_id,
                      stats=stats)
    doc.save()
    add_comments(project_id,
                 metadata["author"]["username"].lower(),
                 credentials_file=credentials_file)

    # Validate against studio's schema, if available
    if studio_id > 0:
        challenge = Studio.objects(
            studio_id=studio_id).only("challenge_id").first()
        if challenge is not None and challenge["challenge_id"] is not None:
            validation = schema.validate_project(challenge["challenge_id"],
                                                 project_id,
                                                 studio_id,
                                                 credentials_file=credentials_file)  # yapf: disable
            del validation["_id"]
            doc.validation[str(challenge["challenge_id"])] = validation
            doc.save()
    logging.debug("successfully scraped project {}".format(project_id))
    return True

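# A single-project run outside of a studio scrape, as a sketch — the IDs are
# placeholders:
#
#     add_project(123456789, studio_id=26211962,
#                 cache_directory=settings.CACHE_DIRECTORY)
#     # => True on success; False when stats couldn't be computed
#     #    (e.g. an unconvertible Scratch 2 project)
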
def get_project_page(pid, cache_directory=settings.CACHE_DIRECTORY):
    """Get a project page rendered in HTML given a project ID.

    Args:
        pid (int): project ID.
        cache_directory (str): the directory where cached projects are stored.

    Returns:
        A string containing the HTML for the page.
    """
    # Load in the project db, project JSON, studio info, and schema
    project, scratch_data = scrape.get_project(pid, cache_directory)
    if len(project) == 0 or len(scratch_data) == 0:
        message = ('We couldn’t find your project! '
                   '<a href="/project/r/{}">Try again</a>'.format(pid))
        return render_template("project_loader.html", message=message)
    studio = scrape.get_studio(project["studio_id"])
    if "challenge_id" in studio:
        sc = schema.get_schema(studio["challenge_id"])

        # Determine whether there's an error here
        err = False
        if str(studio["challenge_id"]) in project["validation"]:
            project["validation"] = project["validation"][str(studio["challenge_id"])]  # yapf: disable
        else:
            err = True

        # Show error page
        if project == {} or scratch_data == {} or studio == {} or sc == {} or err:  # yapf: disable
            raise NotFound()

        # Prepare helper tools
        scraper = Scraper()
        visualizer = Visualizer()

        # Convert Markdown to HTML with Scratchblocks
        if "text" in sc:
            for key in sc["text"]:
                sc["text"][key] = common.md(sc["text"][key])

        # Get the code excerpt for the projects to be shown
        excerpts = dict()
        examples = get_comparisons(project, sc, 5) + [project]
        for example in examples:
            code, sprite = get_code_excerpt(example, sc)
            excerpts[example["project_id"]] = {
                "author": example["author"],
                "code": code,
                "sprite": sprite
            }

        # Get the saved reflection, if any
        _reflections = scrape.ProjectReflection.objects(
            project_id=pid).order_by("-timestamp")
        try:
            reflection = _reflections.first().to_mongo().to_dict()
            reflection["editable"] = (
                reflection["gu_uid"] == request.cookies.get("_gu_uid"))
        except (AttributeError, KeyError):
            # No reflection saved yet, or no gu_uid recorded on it
            reflection = dict()
    else:
        sc = dict()
        excerpts = dict()
        reflection = dict()

    # One prompt variable to take the logic out of the templating language
    prompt = {
        "title":
            sc["title"] if "title" in sc and sc["title"] is not None else
            studio["title"] if "title" in studio else None,
        "description":
            sc["description"] if "description" in sc else
            studio["description"] if "description" in studio else None
    }

    # Choose stats to show
    studio["stats"] = get_studio_stats(sc, studio)

    # Get the feels
    feels = get_feels(randomize=True)
    return render_template("project.html",
                           prompt=prompt,
                           project=project,
                           studio=studio,
                           schema=sc,
                           excerpts=excerpts,
                           feels=feels,
                           reflection=reflection)

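# A minimal sketch of how this helper might be wired up — the route path and
# app object are assumptions, not taken from this module:
#
#     @app.route("/project/<int:pid>")
#     def project(pid):
#         return get_project_page(pid)
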