class FetchBuildDetails(BaseBDroneTask):
    requires = Requires()
    builds = Requirement(FetchBuilds)
    timestamp = Parameter()

    @delayed
    def fetch_build_details(self, repo, build_id, repo_id):
        resp_json = self.get_result(f"{repo}/builds/{build_id}")
        df = pd.DataFrame(resp_json, columns=[*columns, 'stages']).fillna('None')
        df['repo_id'] = repo_id

        return ddf.from_pandas(df, npartitions=1)

    def __init__(self, timestamp):
        super().__init__(timestamp)
        self.output_path = os.path.join(get_base_output_path(), timestamp, "details")

    def run(self):
        print("########## Fetching Build Details ############")
        df = self.input()["builds"].read_dask()
        fetches = list(df
                       .apply(lambda r: self.fetch_build_details(r["repo_name"], r["number"], r["repo_id"]), axis=1)
                       .compute())

        dfs = ddf.concat([*compute(*fetches)], axis=0)\
            .set_index('number')\
            .map_partitions(lambda x: x.sort_index())

        self.output().write_dask(dfs)

    def output(self):
        return CSVTarget(
            path=self.output_path + os.path.sep,
            glob="*.part"
        )
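# A minimal sketch of the fan-out pattern used above, assuming plain
# dask.delayed underneath: each decorated call is deferred, one compute()
# runs them in parallel, and ddf.concat stitches the per-call frames
# together. fetch_one is illustrative, not part of the pipeline.
from dask import compute, delayed
import dask.dataframe as ddf
import pandas as pd

@delayed
def fetch_one(i):
    # stands in for one HTTP fetch that yields a single-partition dask frame
    return ddf.from_pandas(pd.DataFrame({"number": [i]}), npartitions=1)

parts = compute(*[fetch_one(i) for i in range(3)])  # runs the fetches in parallel
combined = ddf.concat(list(parts), axis=0)          # one frame with all rows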
class download_slides(Task):
    """
    Download raw slides from S3 to local.
    """
    LOCAL_ROOT = 'raw_slides'
    course_name = Parameter('CSCI-E29')

    requires = Requires()
    data = Requirement(raw_slides)

    output = TargetOutput(file_pattern=os.path.join(LOCAL_ROOT,'{task.course_name}'), ext='')
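    # file_pattern is formatted with task=<this instance>, so
    # '{task.course_name}' expands to the parameter's value at runtime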

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start downloading pdf's--------------")

        if isinstance(self.data.output(), S3Target):
            os.mkdir(self.output().path)
            for file in self.data.output().fs.list(self.data.output().path):
                if file:
                    self.data.output().fs.get(s3_path=self.data.output().path+r'/'+file,
                                              destination_local_path=os.path.join(self.output().path, file))
        else:
            copytree(self.data.output().path, self.output().path)
        logger.info("-----------Finish downloading pdf's--------------")
class parse_pdf_to_single_page(Task):
    """
    Parse the downloaded slides to single page slide files.
    """
    LOCAL_ROOT = r'pdf.js/static/slides'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(download_slides)

    output = TargetOutput(file_pattern=os.path.join(LOCAL_ROOT,'{task.course_name}'), ext='')

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start parsing pdf's--------------")
        os.mkdir(self.output().path)
        raw_pdf_file = sorted(glob.glob(os.path.join(self.data.output().path, '*.pdf')))

        for fn in raw_pdf_file:
            logger.info(fn)
            # keep the source handle open for the whole read; PdfFileReader
            # reads pages lazily
            with open(fn, "rb") as source:
                inputpdf = PdfFileReader(source)
                folder_name = os.path.join(self.output().path, os.path.basename(fn).replace('.pdf', ''))
                os.mkdir(folder_name)
                for i in range(inputpdf.numPages):
                    output = PdfFileWriter()
                    output.addPage(inputpdf.getPage(i))
                    file_name = os.path.join(folder_name, "{course_name}----{folder}----slide{slide_num}.pdf".format(
                        course_name=self.course_name, folder=os.path.basename(fn).replace('.pdf', ''), slide_num=str(i)))

                    with open(file_name, "wb") as outputStream:
                        output.write(outputStream)

        logger.info("-----------Finish parsing pdf's--------------")
class GenerateFaqJsonFromHtml(BaseGenerateTask):
    requires = Requires()
    downloadhtmltemplate = Requirement(DownloadHTMLTemplate)

    output = TargetOutput(file_pattern="data/cdcfaq.json",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.downloadhtmltemplate.output().open('r') as inf:
            parser = etree.HTMLParser()
            tree = etree.parse(inf, parser)

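            # each FAQ sits in a collapsible card; the heading span is nested
            # four levels deep, so climb back up to the card's root element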
            r = tree.xpath('//span[@role="heading"]/../../../..')

            lst = []

            for v in r:
                hdr = v.xpath('div/div/button/span[@role="heading"]/text()')
                bdy = v.xpath('div/div/div[@class="card-body"]/p/text()')
                q = " ".join(hdr)

                if len(bdy) == 0:
                    bdy = v.xpath(
                        'div/div/div[@class="card-body"]/ul/li/text()')

                tmpAns = " ".join(bdy)
                a = tmpAns[:970] if (len(tmpAns) > 1000) else tmpAns

                lst.append({"q": q, "a": a})
            lst = lst[0:self.number]
            with self.output().open('w') as outf:
                json.dump(lst, outf, indent=2)
class GenerateExcel(BaseGenerateTask):
    requires = Requires()
    generatejson = Requirement(GenerateFaqJsonFromHtml)

    output = TargetOutput(file_pattern="data/cdcfaq.csv",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.generatejson.output().open('r') as inf:
            data = []
            qas = json.load(inf)
            for qa_num, qa in enumerate(qas):
                answer = qa["a"]
                q = qa["q"]
                q = q.replace(",", " ")
                if not answer.strip():
                    continue
                if not q.strip():
                    continue
                data.append({"question": q, "answer": answer})

            df = pd.DataFrame(data)

            with self.output().open('w') as outf:
                # pandas ignores `compression` when handed an open file
                # object, so write a plain CSV here
                df.to_csv(outf, index=False)
class PlotResults(Task):
    """Luigi task that uses prediction and saves plots"""

    __version__ = "1.0"

    data_source = Parameter()
    output_pred = Parameter()
    output_model = Parameter()
    train_loc = Parameter()

    requires = Requires()
    req_1 = Requirement(ConvNeuralTest)
    LOCAL_ROOT = os.path.join(os.getcwd(), "data")

    path = os.path.join(LOCAL_ROOT, "{task.__class__.__name__}-{salt}.png")
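    # {salt} presumably expands to a short parameter/version hash computed by
    # TargetOutput, so each distinct task configuration writes its own .png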

    output = TargetOutput(file_pattern=path,
                          target_class=SuffixPreservingLocalTarget,
                          ext="")

    def run(self):
        """
        Function that loads the prediction and call the show_cam plotting method
        """
        features = np.load(self.req_1.output().path)
        test_path = self.req_1.output().path.rstrip("features.npy")
        results = np.load(test_path + "results.npy")
        gap_weights_l = np.load(test_path + "gap_weights_l.npy",
                                allow_pickle=True)
        test_image = np.load(test_path + "image.npy")

        show_cam(gap_weights_l, results, features, test_image,
                 self.output().path)
class LocalImageReduced(Task):
    """Luigi external task that returns a target for a small subset of train data"""

    __version__ = "1.0"

    requires = Requires()
    req_1 = Requirement(LocalImage)
    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    LOCAL_IMAGE = os.path.join(LOCAL_ROOT, "OCTReduced")

    output = TargetOutput(
        file_pattern=LOCAL_IMAGE, target_class=SuffixPreservingLocalTarget, ext=""
    )

    def run(self):
        """
        Walks the train/test directories and each per-class subdirectory,
        copying a small sample of images into a new directory.
        """
        rootdir = self.req_1.output().path
        newpath = self.output().path
        for src_dir, dirs, files in os.walk(rootdir):
            dst_dir = src_dir.replace(rootdir, newpath, 1)
            if not os.path.exists(dst_dir):
                os.makedirs(dst_dir)
            counter = 0
            for file_ in files:
                src_file = os.path.join(src_dir, file_)
                shutil.copy(src_file, dst_dir)
                counter += 1
                if "train" in dst_dir and counter > 50:
                    break
                elif "test" in dst_dir and counter > 10:
                    break
class MyTask(Task):
    requires = Requires()
    other = Requirement(OtherTask)

    def run(self):
        # invoking the Requires descriptor directly returns the dict of
        # requirements keyed by attribute name
        z = MyTask.requires.__call__(self)
        requirement_taskname = z.get("other")
        assert type(requirement_taskname) == OtherTask
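# One plausible minimal implementation of the Requires/Requirement descriptor
# pair used throughout these examples -- an assumption for illustration; the
# actual helper library may differ in its details.
class Requirement:
    def __init__(self, task_class, **params):
        self.task_class = task_class
        self.params = params

    def __get__(self, task, cls):
        if task is None:
            return self
        # clone() instantiates the required task, copying matching
        # parameters over from the requiring task
        return task.clone(self.task_class, **self.params)


class Requires:
    """Collects every Requirement attribute into the dict that luigi
    expects back from Task.requires()."""

    def __get__(self, task, cls):
        if task is None:
            return self
        return lambda: self(task)

    def __call__(self, task):
        return {
            name: getattr(task, name)
            for name in dir(task.__class__)
            if isinstance(getattr(task.__class__, name, None), Requirement)
        }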
class StoreFront(Task):
    requires = Requires()
    logged_in = Requirement(LoggedIn)
    output = StoreFrontTarget(merchant=MERCHANT_NAME)

    def run(self):
        get_browser(merchant=MERCHANT_NAME).get(
            "https://www.instacart.com/store/wegmans/storefront")
        sleep(14)
class FetchBuilds(BaseBDroneTask):
    requires = Requires()
    date_range = Requirement(DateRange)
    timestamp = Parameter()

    def __init__(self, timestamp):
        super().__init__(timestamp)
        self.output_path = os.path.join(get_base_output_path(), timestamp, "builds")

    @delayed
    def fetch_data(self, repo, repo_id, start_date):
        page_no = 1

        dfs = []

        while True:
            payload = {"page": page_no, "branch": "main"}
            resp_json = self.get_result(
                f"{repo}/builds",
                payload
            )

            if len(resp_json) > 0:
                pd_df = pd.DataFrame(resp_json, columns=columns).fillna('None')
                dfs.append(ddf.from_pandas(pd_df, npartitions=1))

                if resp_json[:-1][0]["started"] < int(start_date.strftime('%s')):
                    break

                page_no += 1
            else:
                break

        df = ddf.concat(dfs)
        df["repo_id"] = repo_id
        df["repo_name"] = repo

        return df

    def run(self):
        print("########## Fetching Builds ############")
        date_range = self.input()["date_range"].get_date_range()
        self.input()["date_range"].mark_fetching()

        targets = [
            self.fetch_data(dr["repo"], dr["repo_id"], dr["start_date"])
            for dr in date_range
        ]

        dfs = ddf.concat([*compute(*targets)], axis=0)
        self.output().write_dask(dfs)

    def output(self):
        return CSVTarget(
            path=self.output_path + os.path.sep,
            glob="*.part"
        )
class LoginPage(Task):
    requires = Requires()
    browser_open = Requirement(BrowserOpen)
    output = LoginPageTarget(merchant=MERCHANT_NAME)

    def run(self):
        buttons = get_browser(
            merchant=MERCHANT_NAME).find_elements_by_css_selector("button")
        login_button = buttons[0]
        login_button.click()
        sleep(5)
class InfoModalDeliveryTimes(Task):
    requires = Requires()
    info_modal = Requirement(InfoModal)
    output = InfoModalDeliveryTimesTarget(merchant=MERCHANT_NAME)

    def run(self):
        find_by_text(
            get_browser(merchant=MERCHANT_NAME),
            "Delivery times",
        )[0].click()
        sleep(8)
class InfoModal(Task):
    requires = Requires()
    main_page = Requirement(StoreFront)
    trial_prompt_closed = Requirement(CloseTrialPrompt)
    output = InfoModalTarget(merchant=MERCHANT_NAME)

    def run(self):
        cart_button = get_browser(
            merchant=MERCHANT_NAME).find_element_by_css_selector(
                'a[href="/wegmans/info?tab=info"]')
        cart_button.click()
        sleep(7)
class CloseTrialPrompt(Task):
    requires = Requires()
    store_front = Requirement(StoreFront)
    output = TrialPromptClosedTarget(merchant=MERCHANT_NAME)

    def run(self):
        buttons = find_by_text(
            get_browser(merchant=MERCHANT_NAME),
            "Got it, Thanks",
        )
        if len(buttons) > 0:
            buttons[0].click()
            sleep(2)
class StoreBuilds(Task):
    requires = Requires()
    details = Requirement(FetchBuildDetails)
    timestamp = Parameter()

    def run(self):
        df = self.input()['details'].read_dask()
        df = df.set_index('number')
        df = df.map_partitions(lambda x: x.sort_index())
        self.output().insert_data(df)
        self.output().mark_fetched()

    def output(self):
        return StoreBuildsTarget()
class DownloadHTMLTemplate(BaseContent):
    requires = Requires()
    contenthtmltemplate = Requirement(ContentHtml)

    output = TargetOutput(
        file_pattern="data/cdcfaq.htm",
        ext="",
        target_class=LocalTarget,
        format=format.Nop
    )

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.contenthtmltemplate.output().open('r') as inf, self.output().open('w') as outf:
            outf.write(inf.read())
class DownloadBotTemplate(BaseContent):
    requires = Requires()
    contentbottemplate = Requirement(ContentBotTemplate)

    output = TargetOutput(
        file_pattern="data/templates/Covidbot_template.json",
        ext="",
        target_class=LocalTarget,
        format=format.Nop
    )

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.contentbottemplate.output().open('r') as inf, self.output().open('w') as outf:
            outf.write(inf.read())
class CountryDimension(Task):
    """Luigi Task to create the country dimension table for lookups."""

    requires = Requires()
    raw = Requirement(DownloadCSV)

    parent_directory = Path(os.path.dirname(
        os.path.realpath(__file__))).parent.parent
    target_filename = os.path.join(str(parent_directory),
                                   "data/country_dimension.csv")

    def output(self):
        """Specifies the LocalTarget output for the task."""
        return LocalTarget(self.target_filename)

    def run(self):
        """Writes the country level statistics to data/country_dimension.csv"""
        raw_data = self.input()["raw"].read_dask(filename="*.part").compute()

        country_dimension = (raw_data.groupby(["location"]).agg({
            "median_age": "max",
            "aged_65_older": "max",
            "aged_70_older": "max",
            "gdp_per_capita": "max",
            "cardiovasc_death_rate": "max",
            "diabetes_prevalence": "max",
            "female_smokers": "max",
            "male_smokers": "max",
            "handwashing_facilities": "max",
            "life_expectancy": "max",
            "human_development_index": "max",
        }).reset_index())

        country_dimension.to_csv(self.target_filename, index=False)
class LatestWeeklyData(Task):
    """Luigi Task which identifies the latest weekly snapshot for each country."""

    requires = Requires()
    other = Requirement(AggregateWeeklyData)

    parent_directory = Path(os.path.dirname(
        os.path.realpath(__file__))).parent.parent
    target_filename = os.path.join(str(parent_directory),
                                   "data/latest_data.csv")

    def output(self):
        """Specifies the LocalTarget output for the task."""
        return LocalTarget(self.target_filename)

    def run(self):
        """Identifies the latest summary statistic by country and writes the results to data/latest_data.csv"""
        with self.input()["other"].open("r") as f:
            data = f.readlines()

        rows = [ele.strip().split(",") for ele in data]
        column_names = rows.pop(0)
        weekly_data = pd.DataFrame(rows, columns=column_names)
        datatypes = {
            "week": str,
            "new_cases": np.float64,
            "stringency_index": np.float64,
            "total_deaths": np.float64,
            "population": np.float64,
            "total_tests": np.float64,
        }
        weekly_data = weekly_data.astype(datatypes)

        max_week = weekly_data.groupby("location").agg({
            "week": "max"
        }).reset_index()
        latest_data = weekly_data.merge(max_week)
        latest_data["death_per_population_pct"] = (
            latest_data["total_deaths"] * 100 / latest_data["population"])
        latest_data["tests_per_population"] = (latest_data["total_tests"] /
                                               latest_data["population"])
        latest_data.to_csv(self.target_filename, index=False)
class extract_text(Task):
    """
    Extract the text information from each page of slides
    """
    LOCAL_ROOT = r'slides'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(parse_pdf_to_single_page)

    output = TargetOutput(file_pattern=LOCAL_ROOT + '_{task.course_name}',
                          ext='')

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info(
            "-----------Start extracting text from pdf's--------------")

        os.mkdir(self.output().path)
        single_pdf_file = sorted(
            glob.glob(os.path.join(self.data.output().path, r'**/*.pdf')))
        content = []
        label = []
        for fn in single_pdf_file:
            if fn.endswith('slide1.pdf'):
                logger.info(fn.rsplit(r'----', 1)[0])
            text = textract.process(fn)
            text = text.decode("utf-8")
            content.append(tokenizer(text))
            label.append(
                os.path.basename(fn).replace('----', '##').replace('.pdf', '')
                + '\n')

        logger.info(
            "-----------Finish extracting text from pdf's--------------")
        with atomic_write(
                os.path.join(self.output().path,
                             '{}.dat'.format(self.output().path))) as fn:
            fn.writelines(content)
        with atomic_write(
                os.path.join(self.output().path, '{}.dat.labels'.format(
                    self.output().path))) as fn:
            fn.writelines(label)
class CleanedReviews(Task):
    __version__ = "1.0.0"
    subset = BoolParameter(default=True)
    requires = Requires()
    task2 = Requirement(YelpReviews)
    parquet_data = "./yelpdata/"

    output = TargetOutput(file_pattern=parquet_data, ext="", target_class=ParquetTarget)

    def run(self):
        df = self.input()["task2"].read_dask(check_complete=True)

        # keep rows with a user id and a well-formed 22-character review id
        df = df[(df.user_id.notnull()) & (df.review_id.str.len() == 22)]
        values = {"funny": 0, "cool": 0, "useful": 0, "stars": 0}
        df = df.fillna(value=values)
        df = df.astype({"funny": int, "cool": int, "useful": int, "stars": int})

        self.output().write_dask(collection=df, compression="gzip")
class ConvNeural(ExternalPythonProgramTask):
    """
    Luigi Task to run a shell script that builds and activates a new venv
    """

    __version__ = "1.0"

    data_source = Parameter()
    output_pred = Parameter()
    output_model = Parameter()

    requires = Requires()

    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    LOCAL_IMAGE = os.path.join(LOCAL_ROOT, "OCTReduced")

    extra_pythonpath = os.getcwd()
    virtualenv = os.path.join(extra_pythonpath,
                              "final_project/tasks/ML_env/.venv")

    def program_args(self):
        "executes a shell script where a new venv is created for ML envirionment and the train or test code is run"
        data_path = os.path.join(self.LOCAL_ROOT, self.data_source)
        ml_path = "final_project/tasks/ML_env"
        model_path = self.temp_output_path
        if self.__class__.__name__ == "ConvNeuralTest":
            model_path = self.input().path

        return [
            "./external_script.sh",
            data_path,
            self.temp_output_path,
            ml_path,
            self.virtualenv,
            self.action,
            model_path,
        ]

    def run(self):
        with self.output().temporary_path() as self.temp_output_path:
            super().run()
class LoggedIn(Task):
    requires = Requires()
    login_page = Requirement(LoginPage)
    output = LoggedInTarget(merchant=MERCHANT_NAME)

    def run(self):
        browser = get_browser(merchant=MERCHANT_NAME)
        email_form = browser.find_element_by_id(
            "nextgen-authenticate.all.log_in_email")
        # TODO: get email from task param
        email_form.send_keys(os.environ["EMAIL"])
        sleep(5)
        password_form = browser.find_element_by_id(
            "nextgen-authenticate.all.log_in_password")
        # TODO: get password from task param
        password_form.send_keys(os.environ["PASSWORD"])
        sleep(5)
        buttons = browser.find_elements_by_css_selector("button")
        login_button = buttons[2]
        login_button.click()
        sleep(14)  # TODO: random delays
class GenerateBot(BaseGenerateTask):
    requires = Requires()
    generatejson = Requirement(GenerateFaqJsonFromHtml)
    downloadbottemplate = Requirement(DownloadBotTemplate)
    COVID_INTENTBASE = "covid"

    output = TargetOutput(file_pattern="data/bot/cdcfaqbot.json",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.generatejson.output().open('r') as inf:
            qas = json.load(inf)
            with self.downloadbottemplate.output().open('r') as template:
                covidbot_templates = json.load(template)

            covidbotresource = covidbot_templates
            covid_intents = covidbotresource["resource"]["intents"]
            for qa_num, qa in enumerate(qas):
                answer = qa["a"]
                q = qa["q"]
                q = q.replace(",", " ")
                if not answer.strip():
                    continue
                if not q.strip():
                    continue

                intent_name = f"{self.COVID_INTENTBASE}{num_to_char(qa_num)}"
                lex_template = json.loads(INTENT_JSON)
                resource = lex_template
                resource["name"] = intent_name
                resource["sampleUtterances"] = gen_sample_utterances(q, qa)
                resource["conclusionStatement"]["messages"][0][
                    "content"] = answer
                covid_intents.append(resource)

            with self.output().open('w') as outf:
                json.dump(covidbot_templates, outf, indent=2)
class AggregateWeeklyData(Task):
    """Luigi task which aggregates covid daily stats by week."""

    requires = Requires()
    other = Requirement(DownloadCSV)

    parent_directory = Path(os.path.dirname(
        os.path.realpath(__file__))).parent.parent
    target_filename = os.path.join(str(parent_directory),
                                   "data/weekly_data.csv")

    def output(self):
        """Specifies the LocalTarget output for the task"""
        return LocalTarget(self.target_filename)

    def run(self):
        """Aggregates case volume, stringency index and additional stats by country and week.
        Writes the results to data/weekly_data.csv
        """
        raw_data = self.input()["other"].read_dask(filename="*.part").compute()
        raw_data["date"] = pd.to_datetime(raw_data.date)
        raw_data["week"] = raw_data["date"].apply(
            lambda x: x - pd.Timedelta(days=x.weekday()))
        raw_data["week"] = raw_data["week"].dt.date.apply(lambda x: str(x))

        weekly_data = (raw_data.groupby(["location", "week"]).agg({
            "new_cases": "sum",
            "stringency_index": "max",
            "total_deaths": "max",
            "population": "max",
            "total_tests": "max",
        }).reset_index().dropna())

        weekly_data.to_csv(self.target_filename, index=False)
class CleanedHeadlines(Task):
    '''
    Loads the article headlines from AWS (if present) and preprocesses the
    data for analysis. The resulting dataframe of preprocessed headlines can
    be loaded by the Topic Modeling class for additional analysis.

    :input: S3 path to article headlines
    :output: local Parquet file with the preprocessed data
    '''
    subset = BoolParameter(default=True)
    requires = Requires()
    article_headlines = Requirement(ArticleHeadlines)
    date = datetime.datetime.now()
    date_suffix = str(date.month) + '_' + str(date.day) + '_' + str(date.year)

    output = TargetOutput(
        target_class=ParquetTarget,
        ext='-' + date_suffix,
        glob="*.parquet",
    )

    def run(self):
        dsk = self.input()['article_headlines'].read_dask(
            dtype={
                "publish_date": "int32",
                "headline_text": "str",
                "headline_id": "str"
            },
            storage_options=dict(requester_pays=True),
        )

        # "".join needs concrete strings, so materialize the dask column first
        headlines_concat = "".join(dsk["headline_id"].compute())
        headlines_hash = hash_str(headlines_concat, get_csci_salt()).hex()[:8]
        self.output().write_dask(dsk, compression="gzip")

    def print_results(self):
        print(self.output().read_dask().compute())
class BySomething(Task):
    __version__ = "1.0.0"

    requires = Requires()
    task3 = Requirement(CleanedReviews)
    # Be sure to read from CleanedReviews locally

    output = TargetOutput(
        file_pattern="./yelpdata/",
        ext=".parquet",
        target_class=ParquetTarget,
        flag=None,
        glob="*.parquet",
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        df = self.input()["task3"].read_dask(
            check_complete=True, column=["year", "stars", "text"]
        )
        df["decade"] = (df["date"].dt.year // 10) % 10
        df["text_length"] = df["text"].str.len()
        by_decade = df[["decade", "text_length"]].groupby("decade").mean().round()
        by_star = df[["stars", "text_length"]].groupby("stars").mean().round()

        df.to_csv("./yelpdata/YelpReviewCleansed/*.csv", index=False, sep=',', encoding='utf-8')

        self.by_decade = by_decade
        self.by_star = by_star
        print(by_decade.compute())
        print(by_star.compute())

    def print_by_decade(self):
        print(self.by_decade.compute())

    def print_by_star(self):
        print(self.by_star.compute())
class GetDeliveryTimes(Task):
    requires = Requires()
    main_page = Requirement(InfoModalDeliveryTimes)

    # get_time_window() makes sure we don't run this more than once every 5 minutes
    output = TargetOutput(
        file_pattern=os.path.join(
            "data", "delivery_times_{}".format(get_time_window()), ""),
        ext=".parquet",
        target_class=ParquetTarget,
    )

    def run(self):
        # Commented out because it significantly increases run time
        # self.detect_load_more_times_button()
        if self.detect_no_deliveries():
            self.output().write_dask(
                dd.from_pandas(pd.DataFrame([]), chunksize=1))
        else:
            self.output().write_dask(
                dd.from_pandas(self.detect_delivery_times(), chunksize=1))

    def detect_delivery_times(self):
        header = find_by_text(
            get_browser(merchant=MERCHANT_NAME),
            "Available Scheduled Times",
        )[0]
        section = get_parent(get_parent(header))
        return pd.DataFrame(self.parse_delivery_times(section.text))

    def parse_delivery_times(self, text):
        delivery_times = []
        date = None  # guard against a time line appearing before any date
        for line in text.splitlines():
            if is_date(line):
                date = line
            if is_time(line):
                delivery_times.append({"date": date, "time": line})
            if is_money(line):
                delivery_times[-1]["price"] = line
        return delivery_times

    def detect_load_more_times_button(self):
        while True:
            try:
                button = get_browser(
                    merchant=MERCHANT_NAME).find_element_by_xpath(
                        '//button[text()="More times"]')
                button.click()
                sleep(5)
            except NoSuchElementException:
                return

    def detect_no_deliveries(self):
        try:
            get_browser(merchant=MERCHANT_NAME).find_element_by_css_selector(
                'img[alt="All delivery windows are full"]')
            return True
        except NoSuchElementException:
            return False

    def print_results(self):
        print(self.get_results())

    def get_results(self):
        return self.output().read_dask().compute()
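# A hedged illustration of the text shape parse_delivery_times expects; the
# sample lines are invented, and is_date/is_time/is_money are the helper
# predicates assumed by the task above.
sample_text = "\n".join([
    "Today, Apr 3",   # is_date() -> starts a new date group
    "10am - 12pm",    # is_time() -> slot appended under the current date
    "$3.99",          # is_money() -> price attached to the last slot
])
# parse_delivery_times(sample_text) would then return:
# [{"date": "Today, Apr 3", "time": "10am - 12pm", "price": "$3.99"}]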
class similarity_calc(Task):
    """
    Calculate similarity with the Okapi BM25 model and return the 10 most related pages for each slide.
    """
    LOCAL_ROOT = r'static/ranking_results'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(extract_text)

    output = TargetOutput(
        file_pattern=os.path.join(LOCAL_ROOT, '{task.course_name}') + r'/',
        ext='',
        target_class=CSVTarget)

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start calculating similarity--------------")
        cfg_course_path = os.path.join(self.data.output().path, 'config.toml')
        if not os.path.exists(cfg_course_path):
            cfg_course = toml.load('config.toml')
            cfg_course['dataset'] += '_' + self.course_name
            cfg_course['index'] += '_' + self.course_name
            with atomic_write(cfg_course_path, 'w') as f:
                f.write(toml.dumps(cfg_course))

        idx = metapy.index.make_inverted_index(cfg_course_path)
        ranker = metapy.index.OkapiBM25()
        top_k = 10
        query = metapy.index.Document()
        with open(
                os.path.join(self.data.output().path,
                             '{}.dat.labels'.format(self.data.output().path)),
                'r') as fn:
            label_list = fn.read().splitlines()
        with open(
                os.path.join(self.data.output().path,
                             '{}.dat'.format(self.data.output().path)),
                'r') as fn:
            txt_list = fn.read().splitlines()
        out = pd.DataFrame(columns=range(21))
        for i in range(len(label_list)):
            if not i % 10:
                logger.info('processing---{}/{}'.format(
                    str(i), len(label_list)))
            row = [label_list[i]]
            query.content(txt_list[i])
            result = ranker.score(idx, query, top_k + 1)
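            # score() returns (doc_id, score) pairs, best first; drop the
            # query's own page, keep the top_k, and normalize by the best score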
            if len(result) > 1:
                top_similarity = result[0]
                result = [res for res in result if res[0] != i][:top_k]
                result_normalize = [(label_list[res[0]],
                                     res[1] / top_similarity[1])
                                    for res in result]
                result_normalize = [
                    item for item_pair in result_normalize
                    for item in item_pair
                ]
                row += result_normalize
            out.loc[i, range(len(row))] = row

        out_dd = dd.from_pandas(out, npartitions=1)
        self.output().write_dask(out_dd, header=False, index=False)

        logger.info("-----------Finish calculating similarity--------------")