def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit=-1):
    """Yield one page-view row per item returned for each configured page.

    The ``pages`` entry of ``self.config`` is read as newline-separated
    text.  Each non-blank line is interpreted as ``<project> <page>``:
    the first whitespace-delimited token is the project and the remainder
    of the line (which may itself contain spaces) is the page title.

    ``dataset_schema``, ``dataset_partitioning``, ``partition_id`` and
    ``records_limit`` are accepted but not used by this implementation.

    Yields:
        dict: keys ``project``, ``date`` (the item's ``timestamp`` passed
        through ``dkuwikipedia.parse_and_format_yyyymmddhh``), ``page``
        and ``views``.
    """
    (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)

    # "or ''" also covers a "pages" key that is present but set to None.
    pages_text = self.config.get("pages", "") or ""

    for raw_line in pages_text.split("\n"):
        entry = raw_line.strip()
        if not entry:
            # Blank lines (and an empty "pages" setting, which splits into a
            # single empty string) would otherwise trigger a query with an
            # empty project and an empty page.
            continue

        # Split on the first run of whitespace only, so tabs or repeated
        # spaces after the project do not leak into the page title while
        # spacing inside the title itself is left untouched.
        parts = entry.split(None, 1)
        project = parts[0]
        page = parts[1] if len(parts) > 1 else ""

        # Parenthesised single-argument form prints the same text under
        # both Python 2 and Python 3.
        print("Query for %s : %s" % (project, page))

        resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
        dic = resp.json()

        # A response without "items" simply contributes no rows.
        for item in dic.get("items", []):
            yield {
                "project": project,
                "date": dkuwikipedia.parse_and_format_yyyymmddhh(
                    item["timestamp"]),
                "page": item["article"],
                "views": item["views"],
            }
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit=-1):
    """Yield the ranked top articles per project for every day in the range.

    The date range comes from ``dkuwikipedia.get_daterange(self.config)``
    and is walked one day at a time, start inclusive and end exclusive.
    For each day, ``dkuwikipedia.query_top`` is called once per project
    returned by ``dkuwikipedia.get_projects(self.config)``.

    ``dataset_schema``, ``dataset_partitioning``, ``partition_id`` and
    ``records_limit`` are accepted but not used by this implementation.

    Yields:
        dict: keys ``project``, ``date`` (the queried day passed through
        ``dkuwikipedia.format_date``), ``page``, ``views`` and ``rank``.
    """
    (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)
    projects = dkuwikipedia.get_projects(self.config)

    # Loop-invariant step, built once instead of on every iteration.
    one_day = datetime.timedelta(days=1)
    # Placeholder used when the response carries no usable "items".
    no_articles = [{"articles": []}]

    cur_date = beg_date
    while cur_date < end_date:
        for project in projects:
            # Parenthesised single-argument form prints the same text under
            # both Python 2 and Python 3.
            print("Query for %s : %s" % (cur_date, project))

            resp = dkuwikipedia.query_top(project, cur_date)
            dic = resp.json()

            # "items" may be missing, null or an empty list.  Indexing [0]
            # on an empty list raised IndexError before; all three cases
            # now contribute no rows.
            items = dic.get("items") or no_articles

            for item in items[0]["articles"]:
                yield {
                    "project": project,
                    "date": dkuwikipedia.format_date(cur_date),
                    "page": item["article"],
                    "views": item["views"],
                    "rank": item["rank"],
                }
        cur_date = cur_date + one_day
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit=-1):
    """Generate top-article rows, day by day and project by project.

    ``dkuwikipedia.get_daterange(self.config)`` supplies the start and end
    dates; the loop visits each day from the start (inclusive) up to, but
    not including, the end.  On each day every project returned by
    ``dkuwikipedia.get_projects(self.config)`` is queried through
    ``dkuwikipedia.query_top``.

    The ``dataset_schema``, ``dataset_partitioning``, ``partition_id`` and
    ``records_limit`` arguments are part of the signature but are not read
    here.

    Yields:
        dict: ``project``, ``date`` (``dkuwikipedia.format_date`` of the
        queried day), ``page``, ``views`` and ``rank`` for each article.
    """
    (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)
    projects = dkuwikipedia.get_projects(self.config)

    # The one-day increment never changes, so create it a single time.
    step = datetime.timedelta(days=1)

    cur_date = beg_date
    while cur_date < end_date:
        for project in projects:
            # Single parenthesised argument: identical output on Python 2
            # and Python 3.
            print("Query for %s : %s" % (cur_date, project))

            resp = dkuwikipedia.query_top(project, cur_date)
            dic = resp.json()

            # Guard against a missing, null or empty "items" value; the
            # unconditional [0] lookup failed with IndexError on an empty
            # list.
            items = dic.get("items")
            if not items:
                continue

            for item in items[0]["articles"]:
                yield {
                    "project": project,
                    "date": dkuwikipedia.format_date(cur_date),
                    "page": item["article"],
                    "views": item["views"],
                    "rank": item["rank"],
                }
        cur_date = cur_date + step
def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                  partition_id=None, records_limit=-1):
    """Generate page-view rows for every page listed in the configuration.

    ``self.config["pages"]`` is treated as newline-separated text in which
    each non-blank line reads ``<project> <page>``.  The project is the
    first whitespace-delimited token; everything after it is the page
    title, which may contain spaces.

    The ``dataset_schema``, ``dataset_partitioning``, ``partition_id`` and
    ``records_limit`` arguments are part of the signature but are not read
    here.

    Yields:
        dict: ``project``, ``date`` (the item's ``timestamp`` run through
        ``dkuwikipedia.parse_and_format_yyyymmddhh``), ``page`` and
        ``views`` for each item in the response.
    """
    (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)

    # Treat a "pages" value of None the same as an absent one.
    configured = self.config.get("pages", "") or ""

    for raw_line in configured.split("\n"):
        entry = raw_line.strip()
        if not entry:
            # An empty setting splits into [""], and blank lines produce
            # empty entries too; querying with an empty project and page
            # is never useful, so skip them.
            continue

        # maxsplit=1 on arbitrary whitespace: the separator between project
        # and title can be a tab or several spaces without polluting the
        # title, and spacing inside the title is preserved.
        tokens = entry.split(None, 1)
        project = tokens[0]
        page = tokens[1] if len(tokens) > 1 else ""

        # Single parenthesised argument: identical output on Python 2 and
        # Python 3.
        print("Query for %s : %s" % (project, page))

        resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
        dic = resp.json()

        # No "items" key means no rows for this page.
        for item in dic.get("items", []):
            yield {
                "project": project,
                "date": dkuwikipedia.parse_and_format_yyyymmddhh(
                    item["timestamp"]),
                "page": item["article"],
                "views": item["views"],
            }
# Recipe script fragment (statements collapsed onto one line; cut off inside
# the schema_columns list, so the remainder of the script is not visible).
#
# Module level:
#   - reads the recipe configuration via get_recipe_config() and derives
#     (beg_date, end_date) from it with dkuwikipedia.get_daterange();
#   - opens the first dataset bound to the 'pages_list' input role;
#   - opens the first dataset bound to the 'main' output role;
#   - starts a schema_columns list; the entries visible here declare
#     "project" (string), "page" (string) and "date" (date).
#
# get_rows():
#   Generator over pages_list_dataset.iter_rows().  For each input row it
#   reads row["project"] and row["page"], prints a "Query for ..." line,
#   calls dkuwikipedia.query_page(project, page, beg_date, end_date) and
#   yields one dict per entry of the JSON response's "items" list (no rows
#   when the key is absent), with keys "project", "date" (timestamp passed
#   through dkuwikipedia.parse_and_format_yyyymmddhh), "page" and "views".
#   NOTE(review): the inner loop reuses the name `item`, shadowing the outer
#   input row; this is harmless only because project/page are read before
#   the inner loop starts.
import dataiku from dataiku.customrecipe import * import dkuwikipedia config = get_recipe_config() (beg_date, end_date) = dkuwikipedia.get_daterange(config) pages_list_dataset = dataiku.Dataset(get_input_names_for_role('pages_list')[0]) def get_rows(): for item in pages_list_dataset.iter_rows(): project = item["project"] page = item["page"] print "Query for %s : %s" % (project, page) resp = dkuwikipedia.query_page(project, page, beg_date, end_date) dic = resp.json() for item in dic.get("items", []): yield { "project" : project, "date" : dkuwikipedia.parse_and_format_yyyymmddhh(item["timestamp"]), "page" : item["article"], "views" : item["views"], } out_dataset = dataiku.Dataset(get_output_names_for_role('main')[0]) schema_columns = [ {"name" : "project", "type" : "string"}, {"name" : "page", "type" : "string"}, {"name" : "date", "type" : "date"},
# Recipe script fragment (statements collapsed onto one line; cut off right
# after the opening of the schema_columns list, so the schema entries and
# the remainder of the script are not visible).
#
# Module level:
#   - reads the recipe configuration via get_recipe_config() and derives
#     (beg_date, end_date) from it with dkuwikipedia.get_daterange();
#   - opens the first dataset bound to the 'pages_list' input role;
#   - opens the first dataset bound to the 'main' output role;
#   - begins building schema_columns.
#
# get_rows():
#   Generator over pages_list_dataset.iter_rows().  For each input row it
#   reads row["project"] and row["page"], prints a "Query for ..." line,
#   calls dkuwikipedia.query_page(project, page, beg_date, end_date) and
#   yields one dict per entry of the JSON response's "items" list (no rows
#   when the key is absent), with keys "project", "date" (timestamp passed
#   through dkuwikipedia.parse_and_format_yyyymmddhh), "page" and "views".
#   NOTE(review): the inner loop reuses the name `item`, shadowing the outer
#   input row; this is harmless only because project/page are read before
#   the inner loop starts.
import dataiku from dataiku.customrecipe import * import dkuwikipedia config = get_recipe_config() (beg_date, end_date) = dkuwikipedia.get_daterange(config) pages_list_dataset = dataiku.Dataset(get_input_names_for_role('pages_list')[0]) def get_rows(): for item in pages_list_dataset.iter_rows(): project = item["project"] page = item["page"] print "Query for %s : %s" % (project, page) resp = dkuwikipedia.query_page(project, page, beg_date, end_date) dic = resp.json() for item in dic.get("items", []): yield { "project": project, "date": dkuwikipedia.parse_and_format_yyyymmddhh(item["timestamp"]), "page": item["article"], "views": item["views"], } out_dataset = dataiku.Dataset(get_output_names_for_role('main')[0]) schema_columns = [{