Пример #1
0
    def generate_rows(self,
                      dataset_schema=None,
                      dataset_partitioning=None,
                      partition_id=None,
                      records_limit=-1):
        """Yield page-view rows for every page listed in the configuration.

        The ``pages`` entry of ``self.config`` is read as newline-separated
        text.  On each non-blank line the first whitespace-delimited token is
        taken as the project and the rest of the line as the page title.
        Blank lines are skipped so no query is issued with an empty project.

        ``dataset_schema``, ``dataset_partitioning``, ``partition_id`` and
        ``records_limit`` are accepted but not used by this implementation.

        Each yielded dict has the keys ``project``, ``date``, ``page`` and
        ``views``.
        """
        (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)

        for raw_line in self.config.get("pages", "").split("\n"):
            # split(None, 1) strips surrounding whitespace and collapses the
            # separator run, so repeated spaces between project and title do
            # not leak into the page title.
            fields = raw_line.split(None, 1)
            if not fields:
                # Empty config value, blank line or trailing newline.
                continue

            project = fields[0]
            page = fields[1].rstrip() if len(fields) > 1 else ""

            # Single parenthesised argument: prints the same text whether
            # ``print`` is the Python 2 statement or the Python 3 function.
            print("Query for %s : %s" % (project, page))
            resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
            dic = resp.json()
            for item in dic.get("items", []):
                yield {
                    "project":
                    project,
                    "date":
                    dkuwikipedia.parse_and_format_yyyymmddhh(
                        item["timestamp"]),
                    "page":
                    item["article"],
                    "views":
                    item["views"],
                }
Пример #2
0
    def generate_rows(self,
                      dataset_schema=None,
                      dataset_partitioning=None,
                      partition_id=None,
                      records_limit=-1):
        """Yield the top-article rows for each project and each day.

        The date range and the project list both come from ``self.config``
        via ``dkuwikipedia``.  Days are visited from ``beg_date`` up to, but
        not including, ``end_date`` in one-day steps, and every project is
        queried once per day.

        ``dataset_schema``, ``dataset_partitioning``, ``partition_id`` and
        ``records_limit`` are accepted but not used by this implementation.

        Each yielded dict has the keys ``project``, ``date``, ``page``,
        ``views`` and ``rank``.
        """
        (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)
        projects = dkuwikipedia.get_projects(self.config)

        cur_date = beg_date
        while cur_date < end_date:
            for project in projects:

                # Single parenthesised argument: prints the same text whether
                # ``print`` is the Python 2 statement or the Python 3 function.
                print("Query for %s : %s" % (cur_date, project))
                resp = dkuwikipedia.query_top(project, cur_date)
                dic = resp.json()

                # "items" may be missing, null or an empty list; indexing
                # [0] unconditionally would raise in the last two cases.
                items = dic.get("items") or []
                articles = items[0]["articles"] if items else []

                for item in articles:
                    yield {
                        "project": project,
                        "date": dkuwikipedia.format_date(cur_date),
                        "page": item["article"],
                        "views": item["views"],
                        "rank": item["rank"]
                    }
            cur_date = cur_date + datetime.timedelta(days=1)
Пример #3
0
    def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                            partition_id=None, records_limit = -1):
        """Generate one row per top article, per project, per day.

        ``dkuwikipedia.get_daterange`` and ``dkuwikipedia.get_projects`` are
        both fed ``self.config``.  The loop walks day by day from
        ``beg_date`` while the current day is strictly before ``end_date``,
        calling ``dkuwikipedia.query_top`` for every project on every day.

        None of the four keyword parameters is read by this implementation.

        Yielded dicts carry ``project``, ``date``, ``page``, ``views`` and
        ``rank``.
        """
        (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)
        projects = dkuwikipedia.get_projects(self.config)

        cur_date = beg_date
        while cur_date < end_date:
            for project in projects:

                # Written as a call with one argument so the line is valid,
                # and prints identically, on both Python 2 and Python 3.
                print("Query for %s : %s" % (cur_date, project))
                resp = dkuwikipedia.query_top(project, cur_date)
                dic = resp.json()

                # Guard the [0] lookup: a response whose "items" is absent,
                # null or empty simply contributes no rows.
                items = dic.get("items") or []
                if not items:
                    continue

                for item in items[0]["articles"]:
                    yield {
                        "project" : project,
                        "date" : dkuwikipedia.format_date(cur_date),
                        "page" : item["article"],
                        "views" : item["views"],
                        "rank" : item["rank"]
                    }
            cur_date = cur_date + datetime.timedelta(days=1)
Пример #4
0
    def generate_rows(self, dataset_schema=None, dataset_partitioning=None,
                            partition_id=None, records_limit = -1):
        """Generate view-count rows for each page named in ``self.config``.

        ``self.config["pages"]`` (empty string if absent) is split on
        newlines.  A usable line is ``<project> <page title>``: the first
        whitespace-delimited token is the project, everything after it is
        the page title.  Lines containing only whitespace are ignored.

        None of the four keyword parameters is read by this implementation.

        Yielded dicts carry ``project``, ``date``, ``page`` and ``views``.
        """
        (beg_date, end_date) = dkuwikipedia.get_daterange(self.config)

        for entry in self.config.get("pages", "").split("\n"):
            entry = entry.strip()
            if not entry:
                # An empty setting or a trailing newline yields "" here;
                # there is nothing to query for it.
                continue

            # Split once on any whitespace run so extra separators between
            # the project and the title are not carried into the title.
            parts = entry.split(None, 1)
            project = parts[0]
            page = parts[1] if len(parts) == 2 else ""

            # Written as a call with one argument so the line is valid,
            # and prints identically, on both Python 2 and Python 3.
            print("Query for %s : %s" % (project, page))
            resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
            dic = resp.json()
            for item in dic.get("items", []):
                yield {
                    "project" : project,
                    "date" : dkuwikipedia.parse_and_format_yyyymmddhh(item["timestamp"]),
                    "page" : item["article"],
                    "views" : item["views"],
                }
Пример #5
0
import dataiku
from dataiku.customrecipe import *
import dkuwikipedia

# Recipe settings, and the date range computed from them.
config = get_recipe_config()
beg_date, end_date = dkuwikipedia.get_daterange(config)

# Dataset bound to the first name registered for the 'pages_list' input role.
pages_list_dataset = dataiku.Dataset(get_input_names_for_role("pages_list")[0])

def get_rows():
    """Yield page-view rows for every row of the ``pages_list`` input.

    Each input row supplies a ``project`` and a ``page``; these are queried
    through ``dkuwikipedia.query_page`` over the module-level
    ``beg_date`` / ``end_date`` range.  One dict with the keys ``project``,
    ``date``, ``page`` and ``views`` is yielded per entry of the response's
    ``items`` list (no rows when ``items`` is absent).
    """
    for input_row in pages_list_dataset.iter_rows():
        project = input_row["project"]
        page = input_row["page"]

        # Single parenthesised argument: prints the same text whether
        # ``print`` is the Python 2 statement or the Python 3 function.
        print("Query for %s : %s" % (project, page))
        resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
        dic = resp.json()
        # The response entries get their own loop variable so the input row
        # is not shadowed.
        for item in dic.get("items", []):
            yield {
                "project" : project,
                "date" : dkuwikipedia.parse_and_format_yyyymmddhh(item["timestamp"]),
                "page" : item["article"],
                "views" : item["views"],
            }

# Dataset bound to the first name registered for the 'main' output role.
out_dataset = dataiku.Dataset(get_output_names_for_role("main")[0])

schema_columns = [
    {"name" : "project", "type" : "string"},
    {"name" : "page", "type" : "string"},
    {"name" : "date", "type" : "date"},
Пример #6
0
import dataiku
from dataiku.customrecipe import *
import dkuwikipedia

# Load the recipe configuration and compute the date range from it.
config = get_recipe_config()
beg_date, end_date = dkuwikipedia.get_daterange(config)

# Input dataset: first name listed under the 'pages_list' role.
pages_list_dataset = dataiku.Dataset(get_input_names_for_role("pages_list")[0])


def get_rows():
    """Generate view-count rows for each page listed in the input dataset.

    Iterates ``pages_list_dataset``; every row must provide ``project`` and
    ``page``.  For each pair, ``dkuwikipedia.query_page`` is called with the
    module-level ``beg_date`` and ``end_date``, and each entry under the
    response's ``items`` key becomes one yielded dict with the keys
    ``project``, ``date``, ``page`` and ``views``.
    """
    for source_row in pages_list_dataset.iter_rows():
        project = source_row["project"]
        page = source_row["page"]

        # Written as a call with one argument so the line is valid, and
        # prints identically, on both Python 2 and Python 3.
        print("Query for %s : %s" % (project, page))
        resp = dkuwikipedia.query_page(project, page, beg_date, end_date)
        dic = resp.json()
        # A separate name for response entries keeps the input row intact
        # instead of rebinding the outer loop variable.
        for entry in dic.get("items", []):
            yield {
                "project": project,
                "date":
                dkuwikipedia.parse_and_format_yyyymmddhh(entry["timestamp"]),
                "page": entry["article"],
                "views": entry["views"],
            }


# Output dataset: first name listed under the 'main' role.
out_dataset = dataiku.Dataset(get_output_names_for_role("main")[0])

schema_columns = [{