Пример #1
0
class YellowTaxiDateRangeTask(luigi.WrapperTask):
    """Wrapper task that requires one monthly copy task per month in a range.

    Parameters:
        start: first month of the range (inclusive).
        stop: last month of the range (inclusive).
    """

    start = luigi.MonthParameter()
    stop = luigi.MonthParameter()

    def requires(self):
        """Yield a ``CopyTaxiTripData2SQLite`` task for each month from ``start`` through ``stop``."""
        month = self.start
        while True:
            if month > self.stop:
                return
            yield CopyTaxiTripData2SQLite(date=month)
            month = month + relativedelta(months=1)
Пример #2
0
class PandasDFDemo(luigi.Task):
    """Write the string rendering of the generated customers dataframe to a text file."""

    month: datetime.date = luigi.MonthParameter(default=datetime.date.today())
    profession: str = luigi.Parameter(default="Engineer")

    def requires(self):
        """Depend on the generated customer CSV for the same month."""
        return GenerateCustomers(month=self.month)

    def output(self):
        """Return the local text target named after the profession and month."""
        target_path = f"data/customers#{self.profession}_{self.month}.txt"
        return luigi.LocalTarget(target_path)

    def run(self):
        """Read the customer CSV into a dataframe, dump ``str(df)`` to the output, and return the dataframe."""
        column_names = ["Name", "Address", "Birthdate", "Job", "Company", "Email"]

        with self.input().open() as source:
            frame = pandas.read_csv(source, names=column_names)

        with self.output().open("w") as sink:
            sink.write(str(frame))

        return frame
Пример #3
0
class NominetDomainListToHDFS(luigi.Task):
    """
    Copy the monthly domain list produced by ``NominetDomainListFTP`` onto HDFS.
    """
    date = luigi.MonthParameter(default=datetime.date.today())

    task_namespace = 'ingest'

    def requires(self):
        """Depend on the remote domain list for the same month."""
        return NominetDomainListFTP(date=self.date)

    def output(self):
        """Return the HDFS target whose name embeds the month as YYYYMM."""
        month_stamp = self.date.strftime('%Y%m')
        return luigi.contrib.hdfs.HdfsTarget(
            path="/1_data/nominet/domains.%s.csv.gz" % month_stamp,
            format=WebHdfsPlainFormat())

    def run(self):
        """Stream the input to the output in ``DEFAULT_BUFFER_SIZE`` chunks."""
        with self.input().open('r') as reader, self.output().open('w') as writer:
            logger.info("Uploading %s to %s" %
                        (self.input().path, self.output().path))
            # Copy until read() returns an empty (falsy) chunk.
            chunk = reader.read(DEFAULT_BUFFER_SIZE)
            while chunk:
                writer.write(chunk)
                chunk = reader.read(DEFAULT_BUFFER_SIZE)
Пример #4
0
class NominetDomainListFTP(luigi.ExternalTask):
    """
    External monthly domain-list file on the remote SFTP service.

    NOTE: the remote host key must already be known on the machine running this task
    (e.g. run `ssh USER@HOST` once when setting up a new server or when the remote server changes).
    """
    date = luigi.MonthParameter(default=datetime.date.today())

    task_namespace = 'ingest'

    def output(self):
        """
        Return the remote target for this month's file.

        The file name embeds the month as YYYYMM and the target is accessed over SFTP
        using the module-level host and credentials.

        :return: the target output for this task.
        :rtype: object (:py:class:`~luigi.target.Target`)
        """
        month_stamp = self.date.strftime('%Y%m')
        remote_path = '/home/bl/domains.%s.csv.gz' % month_stamp
        return luigi.contrib.ftp.RemoteTarget(
            remote_path,
            NOM_HOST,
            username=NOM_USER,
            password=NOM_PWD,
            sftp=True,
        )
Пример #5
0
class GenerateCustomers(luigi.Task):
    """Write ``count`` fake customer rows for ``month`` to a CSV file."""

    month: datetime.date = luigi.MonthParameter()
    count: int = luigi.IntParameter(default=10000)

    def output(self):
        """Return the local CSV target named after the month."""
        return luigi.LocalTarget(f"data/customers_{self.month}.csv")

    def run(self):
        """Generate the rows with Faker and stream them through ``csv.writer``."""
        fake = faker.Faker()

        def customer_rows():
            # One row per customer: name, single-line address, birthdate, job, company, email.
            for _ in range(self.count):
                yield [
                    fake.name(),
                    fake.address().replace("\n", " "),
                    fake.date_between_dates(
                        date_start=datetime.date(1930, 1, 1),
                        date_end=datetime.date(2000, 1, 1),
                    ),
                    fake.job(),
                    fake.company(),
                    fake.company_email(),
                ]

        with self.output().open("w") as outfile:
            csv.writer(outfile).writerows(customer_rows())
Пример #6
0
class CustomerSalaries(luigi.Task):
    """Join generated customers with company salaries on the company name."""

    month: datetime.date = luigi.MonthParameter(default=datetime.date.today())
    job: str = luigi.Parameter(default="Engineer")

    def requires(self):
        """Depend on the customer CSV ("a") and the salary CSV ("b")."""
        return {
            "a": GenerateCustomers(month=self.month),
            "b": CompanyEngineerSalary(month=self.month, job=self.job),
        }

    def output(self):
        """Return the local CSV target named after the job and month."""
        return luigi.LocalTarget(f"data/salaries_{self.job}_{self.month}.csv")

    def run(self):
        """Read both inputs, merge name/company pairs with salaries, write the CSV and print the head."""
        customer_columns = ["Name", "Address", "Birthdate", "Job", "Company", "Email"]
        salary_columns = ["Position", "Company", "Salary"]

        with self.input()["a"].open() as customer_file:
            customers = pandas.read_csv(customer_file, names=customer_columns)

        with self.input()["b"].open() as salaries_file:
            salaries = pandas.read_csv(
                salaries_file, delimiter=",", names=salary_columns)

        # Only the name and company are carried over from the customer data.
        merged = customers[['Name', 'Company']].merge(salaries, on="Company")
        merged.to_csv(self.output().path)
        print(merged.head())
Пример #7
0
class FollowFilteredEdgelist(luigi.Task):
    '''Edge list with edges whose left-hand side is an unknown user removed,
    leaving only data from users that have a home location assigned.

    Args:
        --name  name used in the save path so the LocationUserList and UnknownList can be identified
        --month
    '''
    month = luigi.MonthParameter()
    name = luigi.Parameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    sources = luigi.TupleParameter(default=('followers', 'following'))

    def requires(self):
        '''Depend on the raw edge list and the remaining home-location list.'''
        raw_edgelist = TwitterFollowRawEdgelist(month=self.month, type=self.type)
        home_locations = RemainedHomeLocation(name=self.name, month=self.month)
        return {'edgelist': raw_edgelist, 'hl': home_locations}

    def output(self):
        '''Return the gzipped TSV target under NETWORK_DIR/filtered/<name>/.'''
        filename = self.month.strftime('%Y%m_{}.tsv.gz'.format(self.type))
        return luigi.LocalTarget(
            os.path.join(NETWORK_DIR, 'filtered', self.name, filename))

    def run(self):
        '''Filter the edge list through the edgefilter script and gzip the result.'''
        template = 'zcat {edgelist.path} | python -m snlocest.scripts.edgefilter -i {hl.path} | gzip > {}'
        with self.output().temporary_path() as temp_output_path:
            run(template.format(temp_output_path, **self.input()),
                shell=True,
                check=True)
Пример #8
0
 def testSerialize(self):
     """Check the string each date-like parameter's ``serialize`` produces."""
     day = datetime.date(2013, 2, 3)
     expectations = (
         (luigi.DateParameter, '2013-02-03'),
         (luigi.YearParameter, '2013'),
         (luigi.MonthParameter, '2013-02'),
     )
     for param_cls, text in expectations:
         self.assertEqual(param_cls().serialize(day), text)
     moment = datetime.datetime(2013, 2, 3, 4, 5)
     self.assertEqual(luigi.DateHourParameter().serialize(moment), '2013-02-03T04')
Пример #9
0
class WriteRollingAveragesToDB(sqla.CopyToTable):
    """Copy the rolling-average rows that fall in ``month`` into the database table."""

    month = luigi.MonthParameter()

    columns = [
        (["date", sa.DATE], {"primary_key": True}),
        (["rolling_average_45d", sa.Float], {"nullable": True}),
    ]
    connection_string = settings.db.url
    table = "trip_duration_rolling_average"
    column_separator = ","

    def requires(self):
        """Depend on the rolling-average calculation for the same month."""
        return CalculateTripDurationRollingAverage45Days(month=self.month)

    def rows(self):
        """Yield ``(date, duration)`` for rows whose month number matches ``month``.

        Falsy durations (such as an empty field) are emitted as ``None``.
        """
        wanted_month = self.month.month
        for date_str, duration in super().rows():
            day = parse(date_str).date()
            if day.month == wanted_month:
                yield day, (duration if duration else None)
Пример #10
0
class Indicator(luigi.Task):
    """Task that adds one computed column to its input data and writes that column to CSV.

    The column values come from ``FN`` and the column name from ``COLUMN_NAME``;
    both are placeholders here (presumably overridden by subclasses — TODO confirm).
    """
    pair = luigi.Parameter()
    exchange = luigi.Parameter()
    month = luigi.MonthParameter()
    period = luigi.Parameter(default="1d")
    destination_path = luigi.Parameter()
    # Callable applied to the input data in run(); None in this class.
    FN = None
    # Name of the column that run() adds and exports.
    COLUMN_NAME = ""

    def column_name(self):
        """Return the name of the computed column (``COLUMN_NAME``)."""
        return self.COLUMN_NAME

    def output(self):
        """Yield the local target for this task and remember it on ``self.target``.

        The path is built by ``hamp.path`` from the definition registered under the
        class name, using the task's string parameters plus a ``"class"`` entry, and
        is placed under ``destination_path``.
        """
        parms = self.to_str_params()
        cls = self.__class__.__name__
        parms["class"] = cls
        path = hamp.path(hamp.DEFINITIONS[cls], **parms)
        path = os.path.join(self.destination_path, path)
        # Side effect: run() reads self.target, which is only assigned once this
        # generator has been iterated.
        self.target = luigi.LocalTarget(path)
        yield self.target

    def run(self):
        """Compute the column, restrict the data to ``month`` and write the column to CSV."""
        # NOTE(review): relies on output() having been iterated on this instance
        # beforehand so that self.target exists — confirm this always holds.
        self.target.makedirs()
        data = hamt.input_df(self.requires())
        name = self.column_name()
        data[name] = self.FN(data)
        next_m = hamt.next_month(self.month, False)
        # Slice from this month up to next_m; presumably the frame is date-indexed — TODO confirm.
        data = data[self.month:next_m]
        data[[name]].to_csv(self.target.path, date_format=hamt.DATE_FORMAT)
Пример #11
0
class NYTaxiTripDurationAnalytics(luigi.WrapperTask):
    """Wrapper task requiring the daily, monthly and rolling average DB writers for ``month``."""

    month = luigi.MonthParameter()

    def requires(self):
        """Yield the three database-writing tasks, in daily / monthly / rolling order."""
        writer_tasks = (
            WriteDailyAveragesToDB,
            WriteMonthlyAveragesToDB,
            WriteRollingAveragesToDB,
        )
        for task_cls in writer_tasks:
            yield task_cls(self.month)
Пример #12
0
class RemainedHomeLocation(luigi.Task):
    '''From the generated home-location data (LocationUserList), drop the users that
    became unknown, then keep only those also present in the user list for which the
    social network was collected, and save the result.

    Args:
        --homelocation-path  path to the home-location data file
    '''
    name = luigi.Parameter()
    month = luigi.MonthParameter()
    sources = luigi.TupleParameter(default=('followers', 'following'))
    homelocation_path = luigi.Parameter()

    def requires(self):
        '''Depend on the unknown list, the home-location user list and the seed user list.'''
        unknown = UnknownList(month=self.month, sources=self.sources)
        userlist = LocationUserList(path=self.homelocation_path)
        seed = SeedUserList(month=self.month)
        return {'unknown': unknown, 'userlist': userlist, 'seed': seed}

    def output(self):
        '''Return the target under data/datasets/<name>/groundtruth/, reusing the user list's file name.'''
        source_name = os.path.basename(self.input()['userlist'].path)
        return luigi.LocalTarget(
            os.path.join('data/datasets', self.name, 'groundtruth', source_name))

    def run(self):
        '''Pipe the user list through two edgefilter passes (-e unknown, then -i seed).'''
        with self.output().temporary_path() as temp_output_path:
            cmd = 'cat {userlist.path} | python -m snlocest.scripts.edgefilter -e {unknown.path} | python -m snlocest.scripts.edgefilter -i {seed.path} > {}'.format(
                temp_output_path, **self.input())
            run(cmd, shell=True, check=True)
Пример #13
0
class CalculateRollingAverage(luigi.Task):
    """Compute the 45-day rolling average from the stored daily averages and save it as CSV."""

    year_month = luigi.MonthParameter()

    def requires(self):
        """Depend on the saved daily averages for the same month."""
        return SaveDailyAverage(self.year_month)

    def output(self):
        """Return the CSV target inside the directory named by the ``DATA_DIR`` environment variable."""
        data_dir = os.getenv("DATA_DIR")
        filename = f"rolling_average_{u.year_month_to_str(self.year_month)}.csv"
        return luigi.LocalTarget(os.path.join(data_dir, filename))

    def run(self):
        """Load the ``daily_average_duration`` table, compute the rolling average and write it out."""
        daily = pd.read_sql_table(
            "daily_average_duration",
            con=self.input().engine,
            parse_dates=["date"],
            index_col="date",
        )

        s.rolling_average_n_days(daily, num_of_days=45).to_csv(self.output().path)
Пример #14
0
class IngestData(luigi.WrapperTask):
    """Entry-point wrapper task for the data pipeline."""

    year_month = luigi.MonthParameter()

    def requires(self):
        """Yield the rolling-average task for ``year_month``."""
        final_task = CalculateRollingAverage(self.year_month)
        yield final_task
class DownloadDataByDate(luigi.Task):
    '''
    Download by year and month as date string (formatted YYYY-MM)
    and by taxi color (green or yellow)
    '''

    date = luigi.MonthParameter()
    taxi_color = luigi.Parameter(default='yellow')

    def output(self):
        '''Return the local target ``tmp/taxi_<color>.<date>.csv``.'''
        return luigi.LocalTarget('tmp/taxi_{color}.{date}.csv'.format(
            date=self.date, color=self.taxi_color))

    def download(self):
        '''Build the URL from ``download_url`` and fetch it with wget.'''
        # NOTE(review): both ``year`` and ``month`` receive the full date object;
        # this is only correct if the URL template applies format specs — confirm.
        url = download_url.format(color=self.taxi_color,
                                  year=self.date,
                                  month=self.date)
        # NOTE(review): wget's -P takes a directory prefix while the output path
        # looks like a file name — confirm that -P (rather than -O) is intended.
        shell('wget -P {output} {url}'.format(output=self.output().path,
                                              url=url))

    def run(self):
        '''Create the parent directory and download; on failure remove any partial output and re-raise.'''
        try:
            self.output().makedirs()
            self.download()
        except Exception:
            # Best-effort cleanup: the output may not exist yet (or may not be a
            # removable file); a cleanup failure must not hide the original error.
            try:
                os.remove(self.output().path)
            except OSError:
                pass
            raise
Пример #16
0
class FollowSocialNetworks(luigi.WrapperTask):
    """Wrapper task requiring every follow-network variant for a month and dataset name."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Return the mutual, follower, followee and linked network tasks, in that order."""
        required = []
        for network_cls in (MutualNetwork, FollowerNetwork, FolloweeNetwork,
                            LinkedNetwork):
            required.append(network_cls(month=self.month, name=self.name))
        return required
Пример #17
0
class SeedUserList(luigi.ExternalTask):
    """External seed user-id list stored under ``basedir``."""

    month = luigi.MonthParameter()
    basedir = luigi.Parameter(default='data/twitter-following-followers-geo')
    # 2 = users with at least 5 tweets in some municipality during 2014;
    # 1 = those users who additionally posted at least 365 tweets in 2014.
    number = luigi.IntParameter(default=2)

    def output(self):
        """Return the local target ``<basedir>/user_id_<number>.txt``."""
        filename = 'user_id_{}.txt'.format(self.number)
        return luigi.LocalTarget(os.path.join(self.basedir, filename))
Пример #18
0
class TwitterFollowingFollowers(luigi.ExternalTask):
    '''Relationship (follow) data between Twitter users.
    '''
    month = luigi.MonthParameter()
    type = luigi.ChoiceParameter(choices=['followers', 'following'])
    basedir = luigi.Parameter(default='data/twitter-following-followers-geo')

    def output(self):
        '''Return the local archive target ``<basedir>/<YYYYMM>-<type>.tar.gz``.'''
        archive_name = self.month.strftime('%Y%m-{}.tar.gz'.format(self.type))
        return luigi.LocalTarget(os.path.join(self.basedir, archive_name))
Пример #19
0
        class Bar(luigi.Task):
            """Task whose completeness is an in-memory flag that ``run`` flips to True."""

            month = luigi.MonthParameter()

            def __init__(self, *args, **kwargs):
                """Initialise the task with the completion flag cleared."""
                super(Bar, self).__init__(*args, **kwargs)
                self.comp = False

            def complete(self):
                """Report whether ``run`` has been executed on this instance."""
                return self.comp

            def run(self):
                """Mark the task as complete."""
                self.comp = True
Пример #20
0
class GetNYTaxiMontlyData(luigi.Task):
    """Download the taxi data for ``month`` and move it into the local cache directory."""

    month = luigi.MonthParameter()

    def output(self):
        """Return the cached CSV target named after the month."""
        filename = f"taxi_data_{self.month}.csv"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, filename))

    def run(self):
        """Resolve the URL, ensure the target directory exists, download, then move the file into place."""
        source_url = h.get_url(self.month)
        self.output().makedirs()
        downloaded = h.download(source_url)
        destination = self.output()
        destination.fs.move(downloaded, destination.path)
Пример #21
0
class FingridMonthlyTask(FingridTask, luigi.Task):
    """Fingrid task covering one calendar month of a chosen measurement."""

    measurement_name = luigi.ChoiceParameter(
        choices=fingrid.MEASUREMENTS.keys())
    month = luigi.MonthParameter()

    def __init__(self, *args, **kwargs):
        """Initialise the task and pass the month's time range to ``fingrid_init``.

        The range runs from midnight on the first of ``month`` to midnight on the
        first of the following month.
        """
        super().__init__(*args, **kwargs)

        assert self.month.day == 1
        month_start = datetime.combine(self.month, datetime.min.time())
        # 32 days after the 1st always falls in the next month; snapping to
        # day 1 yields the first of that month.
        inside_next_month = month_start + timedelta(days=32)
        month_end = inside_next_month.replace(day=1)
        self.fingrid_init(month_start, month_end)
Пример #22
0
class DownloadData(luigi.Task):
    """ Task for downloading the data. """

    year_month = luigi.MonthParameter()

    def output(self):
        """Return the local target at the path ``u.get_local_path`` gives for ``year_month``."""
        local_path = u.get_local_path(self.year_month)
        return luigi.LocalTarget(local_path)

    def run(self):
        """Ensure the ``DATA_DIR`` directory exists, then download the month's data."""
        # exist_ok avoids the check-then-create race when several workers start at once.
        os.makedirs(os.getenv("DATA_DIR"), exist_ok=True)
        u.download_data(self.year_month)
Пример #23
0
class DownloadGeolite2CityDatabase(luigi.Task):
    """Download the GeoLite2 City archive, unpack it and copy the .mmdb file to a dated name."""

    task_namespace = "dc"
    date = luigi.MonthParameter(default=datetime.datetime.today())

    download = "http://geolite.maxmind.com/download/geoip/database/GeoLite2-City.tar.gz"
    # NOTE(review): the glob's directory prefix says "Country" although the City
    # archive is downloaded — confirm it matches the extracted directory name.
    match_glob = "GeoLite2-Country_*/GeoLite2-City.mmdb"

    def output(self):
        """Return the local target ``GeoLite2-City-<date>.mmdb``."""
        return luigi.LocalTarget("GeoLite2-City-%s.mmdb" % self.date)

    def run(self):
        """Run curl, tar and cp in sequence, failing the task if any command fails.

        Raises:
            subprocess.CalledProcessError: when a command exits with a non-zero status.
        """
        import subprocess

        commands = (
            "curl -O %s" % self.download,
            "tar xvfz GeoLite2-City.tar.gz",
            "cp %s %s" % (self.match_glob, self.output().path),
        )
        for command in commands:
            # shell=True is required so the shell expands the glob in the cp command.
            # check_call stops at the first failure rather than reporting success
            # for a task that produced no output.
            subprocess.check_call(command, shell=True)
Пример #24
0
class CalculateTripDurations(luigi.Task):
    """Compute trip durations from the cleaned taxi data and pickle the result."""

    month = luigi.MonthParameter()

    def requires(self):
        """Depend on the cleaned taxi data for the same month."""
        return CleanUpTaxiData(month=self.month)

    def output(self):
        """Return the cached pickle target named after the month."""
        filename = f"trip_durations_{self.month}.pickle"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, filename))

    def run(self):
        """Load the input pickle, calculate durations, reindex on pickup and save."""
        trips = pd.read_pickle(self.input().path)
        durations = p.calculate_durations(trips)
        reindexed = durations.pipe(p.reindex_on_pickup)
        reindexed.to_pickle(self.output().path)
Пример #25
0
class CleanUpTaxiData(luigi.Task):
    """Load the downloaded taxi CSV, rename its columns, filter to ``month`` and pickle it."""

    month = luigi.MonthParameter()

    def requires(self):
        """Depend on the downloaded monthly taxi data."""
        return GetNYTaxiMontlyData(month=self.month)

    def output(self):
        """Return the cached pickle target named after the month."""
        filename = f"taxi_data_clean_{self.month}.pickle"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, filename))

    def run(self):
        """Apply the load / rename / month-filter steps and write the pickle."""
        raw = p.load_csv(self.input().path)
        renamed = raw.pipe(p.rename_columns)
        filtered = renamed.pipe(p.filter_by_month_year, self.month)
        filtered.to_pickle(self.output().path)
Пример #26
0
class KgdTaxPaymentsForMonth(Runner):
    """Runner for one month of KGD tax payments.

    As a result we get three files: the payments data in a .csv file, all
    processed bins in a .prs file, and the bins with no payments data for
    the given month.
    """
    month = luigi.MonthParameter(default=previous_month(1))
    date = luigi.Parameter(default=datetime.today().replace(day=1))
    name = 'kgd_taxpayments'

    def requires(self):
        """Yield the gzip-and-upload task for the date range covering ``month``."""
        upload_task = GzipKgdTaxPaymentsToFtp(
            suff=self.suff,
            period=month_as_dates_range(self.month),
            struct=TaxPaymentsRow,
            **self.params)
        yield upload_task
Пример #27
0
class WriteDailyAveragesToDB(sqla.CopyToTable):
    """Copy the daily average trip durations into the database table."""

    month = luigi.MonthParameter()

    columns = [
        (["date", sa.DATE], {"primary_key": True}),
        (["duration", sa.Float], {}),
    ]
    connection_string = settings.db.url
    table = "trip_duration_daily_average"
    column_separator = ","

    def requires(self):
        """Depend on the daily-average calculation for the same month."""
        return CalculateDailyAverageTripDuration(month=self.month)

    def rows(self):
        """Yield ``(date, duration)`` with the date string parsed into a ``date``."""
        for date_str, duration in super().rows():
            day = parse(date_str).date()
            yield day, duration
Пример #28
0
class CalculateMonthlyAverageTripDuration(luigi.Task):
    """Compute the monthly average trip duration and write it as a header-less CSV."""

    month = luigi.MonthParameter()

    def requires(self):
        """Depend on the trip durations for the same month."""
        return CalculateTripDurations(month=self.month)

    def output(self):
        """Return the cached CSV target named after the month."""
        filename = f"trip_duration_monthly_average_{self.month}.csv"
        return luigi.LocalTarget(
            os.path.join(settings.local_cache_dir, filename))

    def run(self):
        """Load the durations pickle, average per month and save without a header row."""
        durations = pd.read_pickle(self.input().path)
        monthly_averages = p.monthly_average_durations(durations)
        monthly_averages.to_csv(self.output().path, header=False)
Пример #29
0
class FolloweeNetwork(luigi.Task):
    """Decompress the master follow edge list into the dataset's followee network file."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Depend on the master follow edge list for the same month and name."""
        return MasterFollowEdgelist(month=self.month, name=self.name)

    def output(self):
        """Return the target ``data/datasets/<name>/networks/f_followee.tsv``."""
        network_path = os.path.join('data/datasets', self.name, 'networks',
                                    'f_followee.tsv')
        return luigi.LocalTarget(network_path)

    def run(self):
        """Run ``zcat`` on the input, writing to a temporary path that becomes the output."""
        with self.output().temporary_path() as temp_output_path:
            cmd = 'zcat {} > {}'.format(self.input().path, temp_output_path)
            run(cmd, shell=True, check=True)
Пример #30
0
class LinkedNetwork(luigi.Task):
    """Build the linked network: every edge in both directions, sorted and de-duplicated."""

    month = luigi.MonthParameter()
    name = luigi.Parameter()

    def requires(self):
        """Depend on the master follow edge list for the same month and name."""
        return MasterFollowEdgelist(month=self.month, name=self.name)

    def output(self):
        """Return the target ``data/datasets/<name>/networks/f_linked.tsv``."""
        network_path = os.path.join('data/datasets', self.name, 'networks',
                                    'f_linked.tsv')
        return luigi.LocalTarget(network_path)

    def run(self):
        """Emit each edge and its reverse with awk, then sort and uniq into the output."""
        pipeline = '''zcat %s | awk -F"\t" 'BEGIN{OFS="\t"}{print $1,$2;print $2,$1}' | LC_ALL=C sort | LC_ALL=C uniq > %s '''
        with self.output().temporary_path() as temp_output_path:
            run(pipeline % (self.input().path, temp_output_path),
                shell=True,
                check=True)