class ConvNeuralCluster(ExternalProgramTask):
    """Luigi task that copies a file from the cluster to the local output.

    The external program is ``sshpass -p <password> scp <source> <dest>``.
    ``<source>`` and ``<password>`` are read from the ``CLUSTER_PATH`` and
    ``CLUSTER_PASS`` environment variables; ``<dest>`` is a temporary path
    of this task's output target.
    """

    __version__ = "1.0"

    data_source = Parameter()
    output_pred = Parameter()
    output_model = Parameter()

    LOCAL_ROOT = os.path.join(os.getcwd(), "data")

    # The placeholders are left as-is here; presumably TargetOutput fills
    # them in when the output target is built.
    path = os.path.join(
        LOCAL_ROOT, "{task.__class__.__name__}-{salt}/{task.output_model}")

    output = TargetOutput(file_pattern=path,
                          target_class=SuffixPreservingLocalTarget,
                          ext="")

    def program_args(self):
        """Build the argument list of the copy command.

        Returns:
            list: ``["sshpass", "-p", password, "scp", source, destination]``.

        Raises:
            RuntimeError: if ``CLUSTER_PATH`` or ``CLUSTER_PASS`` is unset or
                empty, so a missing setting fails loudly instead of being
                handed to ``scp`` as a bogus argument.
        """
        cluster_path = os.getenv("CLUSTER_PATH")
        cluster_pass = os.getenv("CLUSTER_PASS")

        required = (("CLUSTER_PATH", cluster_path),
                    ("CLUSTER_PASS", cluster_pass))
        missing = [name for name, value in required if not value]
        if missing:
            raise RuntimeError(
                "Missing required environment variable(s): "
                + ", ".join(missing))

        # NOTE(review): `sshpass -p` puts the password on the command line,
        # where other local users may see it in the process list; consider
        # `sshpass -e` with the SSHPASS environment variable instead.
        return [
            "sshpass",
            "-p",
            cluster_pass,
            "scp",
            cluster_path,
            self.temp_output_path,
        ]

    def run(self):
        """Run the copy command against a temporary path of the output."""
        # program_args() reads self.temp_output_path, so the temporary path
        # is bound on the instance for the duration of the parent run().
        with self.output().temporary_path() as self.temp_output_path:
            super().run()
示例#2
0
 class MockTargetOutputTask(ExternalTask):
     """External task whose CSV output lives under ``tmp``, named after the class."""

     output = TargetOutput(
         file_pattern=tmp + "/{task.__class__.__name__}",
         ext=".csv",
         glob="*.csv",
         target_class=CSVTarget,
     )
class PlotResults(Task):
    """Luigi task that loads the test artifacts and saves a plot as PNG."""

    __version__ = "1.0"

    data_source = Parameter()
    output_pred = Parameter()
    output_model = Parameter()
    train_loc = Parameter()

    requires = Requires()
    req_1 = Requirement(ConvNeuralTest)
    LOCAL_ROOT = os.path.join(os.getcwd(), "data")

    path = os.path.join(LOCAL_ROOT, "{task.__class__.__name__}-{salt}.png")

    output = TargetOutput(file_pattern=path,
                          target_class=SuffixPreservingLocalTarget,
                          ext="")

    def run(self):
        """
        Load the features file and its sibling arrays, then call show_cam.

        The sibling files (``results.npy``, ``gap_weights_l.npy`` and
        ``image.npy``) are looked up next to the required task's output.
        """
        features_path = self.req_1.output().path
        features = np.load(features_path)

        # str.rstrip() removes a *set of characters*, not a suffix, so it
        # could eat extra trailing characters of the path. Remove the exact
        # suffix instead; fall back to the containing directory when the
        # features file has a different name.
        suffix = "features.npy"
        if features_path.endswith(suffix):
            test_path = features_path[:-len(suffix)]
        else:
            test_path = os.path.join(os.path.dirname(features_path), "")

        results = np.load(test_path + "results.npy")
        # NOTE(review): allow_pickle=True unpickles arbitrary objects; only
        # safe while this file is produced by our own pipeline.
        gap_weights_l = np.load(test_path + "gap_weights_l.npy",
                                allow_pickle=True)
        test_image = np.load(test_path + "image.npy")

        show_cam(gap_weights_l, results, features, test_image,
                 self.output().path)
示例#4
0
class download_slides(Task):
    """
    Download raw slides from S3 to local.

    When the upstream output is not an S3Target, its directory tree is
    copied locally instead.
    """
    LOCAL_ROOT = 'raw_slides'
    course_name = Parameter('CSCI-E29')

    requires = Requires()
    data = Requirement(raw_slides)

    output = TargetOutput(file_pattern=os.path.join(LOCAL_ROOT,'{task.course_name}'), ext='')

    def run(self):
        """Fetch every listed object (or copy the local tree) into the output directory."""
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start downloading pdf's--------------")

        source = self.data.output()
        destination = self.output().path

        if isinstance(source, S3Target):
            # makedirs also creates LOCAL_ROOT when it does not exist yet;
            # a plain mkdir would raise FileNotFoundError in that case.
            os.makedirs(destination, exist_ok=True)
            for file in source.fs.list(source.path):
                # Skip empty names returned by the listing.
                if file:
                    source.fs.get(s3_path=source.path + r'/' + file,
                                  destination_local_path=os.path.join(destination, file))
        else:
            copytree(source.path, destination)
        logger.info("-----------Finish downloading pdf's--------------")
示例#5
0
class parse_pdf_to_single_page(Task):
    """
    Parse the downloaded slides to single page slide files.

    Every ``*.pdf`` in the upstream directory gets its own sub-folder in the
    output directory, holding one PDF per page named
    ``<course>----<deck>----slide<index>.pdf``.
    """
    LOCAL_ROOT = r'pdf.js/static/slides'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(download_slides)

    output = TargetOutput(file_pattern=os.path.join(LOCAL_ROOT,'{task.course_name}'), ext='')

    def run(self):
        """Split each downloaded PDF into single-page PDF files."""
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start parsing pdf's--------------")
        output_root = self.output().path
        # LOCAL_ROOT is several levels deep; makedirs creates the missing
        # parents, which a plain mkdir would not.
        os.makedirs(output_root, exist_ok=True)
        raw_pdf_file = sorted(glob.glob(os.path.join(self.data.output().path, '*.pdf')))

        for fn in raw_pdf_file:
            logger.info(fn)
            # Deck name is reused for the folder and for every page file.
            deck_name = os.path.basename(fn).replace('.pdf', '')
            folder_name = os.path.join(output_root, deck_name)
            os.makedirs(folder_name, exist_ok=True)
            # Keep the source open only while its pages are read, and close
            # it deterministically afterwards.
            with open(fn, "rb") as pdf_stream:
                inputpdf = PdfFileReader(pdf_stream)
                for i in range(inputpdf.numPages):
                    output = PdfFileWriter()
                    output.addPage(inputpdf.getPage(i))
                    file_name = os.path.join(folder_name, "{course_name}----{folder}----slide{slide_num}.pdf".format(
                        course_name=self.course_name, folder=deck_name, slide_num=str(i)))

                    with open(file_name, "wb") as outputStream:
                        output.write(outputStream)

        logger.info("-----------Finish parsing pdf's--------------")
class GenerateFaqJsonFromHtml(BaseGenerateTask):
    """Extract question/answer pairs from the downloaded HTML into a JSON file."""

    requires = Requires()
    downloadtmltemplate = Requirement(DownloadHTMLTemplate)

    output = TargetOutput(file_pattern="data/cdcfaq.json",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        """Parse the HTML page and dump the first ``self.number`` Q/A entries as JSON."""
        with self.downloadtmltemplate.output().open('r') as html_file:
            html_parser = etree.HTMLParser()
            document = etree.parse(html_file, html_parser)

            # Four levels above each heading span is the element that holds
            # both the question button and the answer body.
            cards = document.xpath('//span[@role="heading"]/../../../..')

            faq_entries = []
            for card in cards:
                heading_parts = card.xpath(
                    'div/div/button/span[@role="heading"]/text()')
                body_parts = card.xpath(
                    'div/div/div[@class="card-body"]/p/text()')
                question = " ".join(heading_parts)

                # No paragraph text: fall back to list items in the body.
                if not body_parts:
                    body_parts = card.xpath(
                        'div/div/div[@class="card-body"]/ul/li/text()')

                answer = " ".join(body_parts)
                # Answers longer than 1000 characters are cut to 970.
                if len(answer) > 1000:
                    answer = answer[:970]

                faq_entries.append({"q": question, "a": answer})

            selected = faq_entries[0:self.number]
            with self.output().open('w') as json_file:
                json.dump(selected, json_file, indent=2)
class GenerateExcel(BaseGenerateTask):
    """Convert the FAQ JSON into a two-column CSV file (question, answer)."""

    requires = Requires()
    geneeratejson = Requirement(GenerateFaqJsonFromHtml)

    output = TargetOutput(file_pattern="data/cdcfaq.csv",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        """Read the Q/A list, drop blank entries and write the rest as CSV."""
        with self.geneeratejson.output().open('r') as inf:
            qas = json.load(inf)

            data = []
            for qa in qas:
                answer = qa["a"]
                # Commas in the question are replaced by spaces.
                q = qa["q"].replace(",", " ")
                if not answer.strip() or not q.strip():
                    continue
                data.append({"question": q, "answer": answer})

            # Explicit columns keep the header even when no row survives.
            df = pd.DataFrame(data, columns=["question", "answer"])

            with self.output().open('w') as outf:
                # No `compression=`: pandas cannot compress through an
                # already-open text handle, and the target is a plain .csv.
                df.to_csv(outf, index=False)
class LocalImageReduced(Task):
    """Luigi task that copies a small subset of the image data into a new directory"""

    __version__ = "1.0"

    requires = Requires()
    req_1 = Requirement(LocalImage)
    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    LOCAL_IMAGE = os.path.join(LOCAL_ROOT, "OCTReduced")

    output = TargetOutput(
        file_pattern=LOCAL_IMAGE, target_class=SuffixPreservingLocalTarget, ext=""
    )

    def run(self):
        """
        Walk the source tree, mirror every directory under the output path and
        copy a capped number of files per directory: the cap applies to
        directories whose path below the source root contains "train" or
        "test"; any other directory is copied in full.
        """
        rootdir = self.req_1.output().path
        newpath = self.output().path
        for src_dir, _dirs, files in os.walk(rootdir):
            dst_dir = src_dir.replace(rootdir, newpath, 1)
            os.makedirs(dst_dir, exist_ok=True)

            # Decide the cap from the path *below* the root only, so that a
            # parent directory such as ".../tests/project/data" cannot
            # accidentally trigger the "train"/"test" limits.
            rel_dir = os.path.relpath(src_dir, rootdir)
            if "train" in rel_dir:
                limit = 50
            elif "test" in rel_dir:
                limit = 10
            else:
                limit = None

            # NOTE(review): the check happens after the copy, so limit + 1
            # files (51 / 11) are copied - kept as in the original.
            for counter, file_ in enumerate(files, start=1):
                shutil.copy(os.path.join(src_dir, file_), dst_dir)
                if limit is not None and counter > limit:
                    break
示例#9
0
 class MockTargetOutputTask(ExternalTask):
     """External task whose output is a CSVTarget at ``tmp`` matching ``*.csv``, with no flag."""

     output = TargetOutput(
         file_pattern=tmp,
         ext="",
         glob="*.csv",
         flag=None,
         target_class=CSVTarget,
     )
示例#10
0
class raw_slides(ExternalTask):
    """
    External task pointing at one course's raw slides stored on S3.
    """
    course_name = Parameter('CSCI-E29')
    SLIDES_ROOT = r's3://cznybj/raw_slides'

    output = TargetOutput(
        target_class=S3Target,
        file_pattern=SLIDES_ROOT + r'/{task.course_name}',
        ext='',
    )
 class MockDownloadCSV(DownloadCSV):
     """DownloadCSV variant that reads from ``filepath`` and writes to ``file_target_path``."""

     file_url = filepath
     target_path = file_target_path

     output = TargetOutput(
         target_class=CSVTarget,
         file_pattern=file_target_path,
         ext="",
         flag=False,
     )
class ContentHtml(BaseContent):
    """S3 location of the FAQ HTML page, opened without format conversion (Nop)."""

    CONTENT_CDC_HTML = "cdcfaq.htm"

    output = TargetOutput(
        target_class=S3Target,
        file_pattern="s3://covid-bot-jt/cdcfaq.htm",
        ext="",
        format=format.Nop,
    )
class ContentBotTemplate(BaseContent):
    """S3 location of the bot template JSON, opened without format conversion (Nop)."""

    # NOTE(review): despite its name, this constant holds the template's
    # file name, not an HTML file.
    CONTENT_CDC_HTML = "Covidbot_template.json"

    output = TargetOutput(
        target_class=S3Target,
        file_pattern="s3://covid-bot-jt/Covidbot_template.json",
        ext="",
        format=format.Nop,
    )
            class SomeTask(Task):
                """Task used to check that the output path is ``outputfilename`` plus ``.csv``."""

                output = TargetOutput(ext=".csv", file_pattern=outputfilename)

                def run(self):
                    """Return True when the output path is as expected, raise otherwise."""
                    print(TargetOutput.__call__(self))
                    if self.output().path != outputfilename + ".csv":
                        raise Exception
                    return True
示例#15
0
class YelpReviews(ExternalTask):
    """External task for the Yelp review CSV files in the requester-pays S3 bucket."""

    __version__ = "1.0.0"
    DATA_ROOT = "s3://pset5yelpreviews/"
    # A "--full" command-line switch once chose between every CSV file and
    # just "yelp_subset_0.csv"; all CSV files are now always matched.
    files = "*.csv"

    output = TargetOutput(
        target_class=CSVTarget,
        file_pattern=DATA_ROOT,
        ext=".parquet",
        glob=files,
        flag=None,
        storage_options=dict(requester_pays=True),
    )
class DownloadHTMLTemplate(BaseContent):
    """Copy the FAQ HTML page from its upstream target to ``data/cdcfaq.htm``."""

    requires = Requires()
    contenthtmltemplate = Requirement(ContentHtml)

    output = TargetOutput(
        target_class=LocalTarget,
        file_pattern="data/cdcfaq.htm",
        ext="",
        format=format.Nop,
    )

    def run(self):
        """Read the whole upstream file and write it to the local output target."""
        with self.contenthtmltemplate.output().open('r') as remote_file:
            with self.output().open('w') as local_file:
                payload = remote_file.read()
                local_file.write(payload)
class DownloadBotTemplate(BaseContent):
    """Copy the bot template JSON from its upstream target to ``data/templates``."""

    requires = Requires()
    contentbottemplate = Requirement(ContentBotTemplate)

    output = TargetOutput(
        target_class=LocalTarget,
        file_pattern="data/templates/Covidbot_template.json",
        ext="",
        format=format.Nop,
    )

    def run(self):
        """Read the whole upstream file and write it to the local output target."""
        with self.contentbottemplate.output().open('r') as remote_file:
            with self.output().open('w') as local_file:
                payload = remote_file.read()
                local_file.write(payload)
class ConvNeuralTrain(ConvNeural):
    """Luigi task running the parent ConvNeural task with the "train" action"""

    __version__ = "1.0"
    action = "train"

    path = os.path.join(
        ConvNeural.LOCAL_ROOT,
        "{task.__class__.__name__}-{salt}/{task.output_model}",
    )

    output = TargetOutput(
        target_class=SuffixPreservingLocalTarget,
        file_pattern=path,
        ext="",
    )

    def requires(self):
        """Depend on LocalImageReduced, cloned with this task's parameters."""
        return self.clone(LocalImageReduced)
class ConvNeuralTest(ConvNeural):
    """Luigi task running the parent ConvNeural task with the "test" action"""

    __version__ = "1.0"
    action = "test"
    train_loc = Parameter()

    path = os.path.join(
        ConvNeural.LOCAL_ROOT,
        "{task.__class__.__name__}-{salt}/{task.output_pred}.npy",
    )
    output = TargetOutput(
        target_class=SuffixPreservingLocalTarget,
        file_pattern=path,
        ext="",
    )

    def requires(self):
        """Depend on the cluster task when ``train_loc`` is "cluster", else on local training."""
        upstream = (ConvNeuralCluster
                    if self.train_loc == "cluster" else ConvNeuralTrain)
        return self.clone(upstream)
示例#20
0
class DownloadCSV(ExternalTask):
    """Luigi task that reads the OWID Covid CSV from its URL and writes it out as a dask collection."""

    file_url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
    # Two levels above the directory that contains this module.
    parent_directory = Path(os.path.dirname(
        os.path.realpath(__file__))).parent.parent
    target_path = "data/covid_data"

    local_directory = os.path.join(str(parent_directory), "data/covid_data/")
    output = TargetOutput(
        target_class=CSVTarget,
        file_pattern=local_directory,
        ext="",
        flag=False,
    )

    def run(self):
        """Load the CSV with pandas, split it into 5000-row chunks and write it via the target."""
        frame = pd.read_csv(self.file_url)
        partitioned = dd.from_pandas(frame, chunksize=5000)
        target = self.output()
        target.write_dask(collection=partitioned, filename=self.target_path)
示例#21
0
class extract_text(Task):
    """
    Extract the text of every single-page slide PDF and store it, together
    with one label per page, in the output directory.
    """
    LOCAL_ROOT = r'slides'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(parse_pdf_to_single_page)

    output = TargetOutput(file_pattern=LOCAL_ROOT + '_{task.course_name}',
                          ext='')

    def run(self):
        """Write ``<output>.dat`` (tokenized text) and ``<output>.dat.labels`` (page labels)."""
        logger = logging.getLogger('luigi-interface')
        logger.info(
            "-----------Start extracting text from pdf's--------------")

        out_dir = self.output().path
        os.mkdir(out_dir)
        pdf_pattern = os.path.join(self.data.output().path, r'**/*.pdf')

        content = []
        label = []
        for pdf_path in sorted(glob.glob(pdf_pattern)):
            # Log once per deck, when its "slide1" page comes up.
            if pdf_path.endswith('slide1.pdf'):
                logger.info(pdf_path.rsplit(r'----', 1)[0])
            page_text = textract.process(pdf_path).decode("utf-8")
            content.append(tokenizer(page_text))
            page_label = os.path.basename(pdf_path)
            page_label = page_label.replace('----', '##').replace('.pdf', '')
            label.append(page_label + '\n')

        logger.info(
            "-----------Finish extracting text from pdf's--------------")

        dat_path = os.path.join(out_dir, '{}.dat'.format(out_dir))
        with atomic_write(dat_path) as handle:
            handle.writelines(content)

        labels_path = os.path.join(out_dir, '{}.dat.labels'.format(out_dir))
        with atomic_write(labels_path) as handle:
            handle.writelines(label)
示例#22
0
class CleanedReviews(Task):
    """Filter the Yelp reviews and write the cleaned data as a parquet dataset."""

    __version__ = "1.0.0"
    subset = BoolParameter(default=True)
    requires = Requires()
    task2 = Requirement(YelpReviews)
    parquet_data = "./yelpdata/"

    output = TargetOutput(target_class=ParquetTarget, file_pattern=parquet_data, ext="")

    def run(self):
        """Keep rows with a user id and a 22-character review id, then fill and cast the count columns."""
        reviews = self.input()["task2"].read_dask(check_complete=True)

        has_user = reviews.user_id.notnull()
        has_full_review_id = reviews.review_id.str.len() == 22
        reviews = reviews[has_user & has_full_review_id]

        # Missing counts become 0 before the columns are cast to int.
        count_columns = ("funny", "cool", "useful", "stars")
        reviews = reviews.fillna(value={name: 0 for name in count_columns})
        reviews = reviews.astype({name: int for name in count_columns})

        self.output().write_dask(collection=reviews, compression="gzip")
class GenerateBot(BaseGenerateTask):
    """Build the bot definition by adding one intent per FAQ entry to the template."""

    requires = Requires()
    geneeratejson = Requirement(GenerateFaqJsonFromHtml)
    downloadbottemplate = Requirement(DownloadBotTemplate)
    COVID_INTENTBASE = "covid"

    output = TargetOutput(file_pattern="data/bot/cdcfaqbot.json",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        """Append an intent for each non-blank Q/A pair and dump the result as JSON."""
        with self.geneeratejson.output().open('r') as faq_file:
            faq_entries = json.load(faq_file)
            with self.downloadbottemplate.output().open('r') as template:
                bot_definition = json.load(template)

            intents = bot_definition["resource"]["intents"]
            for position, entry in enumerate(faq_entries):
                answer = entry["a"]
                question = entry["q"].replace(",", " ")
                # Skip entries with a blank answer or a blank question.
                if not (answer.strip() and question.strip()):
                    continue

                # The suffix uses the position in the full list, so skipped
                # entries leave gaps in the intent names.
                intent_name = f"{self.COVID_INTENTBASE}{num_to_char(position)}"
                # Parse a fresh copy so intents never share nested objects.
                intent = json.loads(INTENT_JSON)
                intent["name"] = intent_name
                intent["sampleUtterances"] = gen_sample_utterances(
                    question, entry)
                first_message = intent["conclusionStatement"]["messages"][0]
                first_message["content"] = answer
                intents.append(intent)

            with self.output().open('w') as outf:
                json.dump(bot_definition, outf, indent=2)
示例#24
0
class CleanedHeadlines(Task):
    """
    Read the article headlines from the upstream task and store them locally
    as a gzip-compressed parquet dataset.

    :input : output of the ArticleHeadlines task
    :output : parquet target whose name carries a month_day_year suffix
    """
    subset = BoolParameter(default=True)
    requires = Requires()
    article_headlines = Requirement(ArticleHeadlines)
    # Evaluated once, when the class body runs (i.e. at import time).
    date = datetime.datetime.now()
    date_suffix = str(date.month) + '_' + str(date.day) + '_' + str(date.year)

    output = TargetOutput(
        target_class=ParquetTarget,
        ext='-' + date_suffix,
        glob="*.parquet",
    )

    def run(self):
        """Load the headlines with explicit dtypes and write them to the output target."""
        dsk = self.input()['article_headlines'].read_dask(
            dtype={
                "publish_date": "int32",
                "headline_text": "str",
                "headline_id": "str"
            },
            storage_options=dict(requester_pays=True),
        )

        # The previous version concatenated every headline_id and hashed the
        # result, but never used the hash; that forced a full pass over the
        # column for nothing and has been removed.
        self.output().write_dask(dsk, compression="gzip")

    def print_results(self):
        """Print the computed contents of the output target."""
        print(self.output().read_dask().compute())
示例#25
0
class BySomething(Task):
    """Aggregate the mean review text length by decade and by star rating."""

    __version__ = "1.0.0"

    requires = Requires()
    task3 = Requirement(CleanedReviews)
    # Be sure to read from CleanedReviews locally

    output = TargetOutput(
        file_pattern="./yelpdata/",
        ext=".parquet",
        target_class=ParquetTarget,
        flag=None,
        glob="*.parquet",
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        """Compute both aggregates, export the frame as CSV and print the results."""
        # NOTE(review): the keyword is `column` (not `columns`) and the list
        # names "year" while the code below reads "date" - confirm against
        # read_dask's signature before changing.
        df = self.input()["task3"].read_dask(
            check_complete=True, column=["year", "stars", "text"]
        )
        df["decade"] = (df["date"].dt.year // 10) % 10
        df["text_length"] = df["text"].str.len()
        by_decade = df[["decade", "text_length"]].groupby("decade").mean().round()
        by_star = df[["stars", "text_length"]].groupby("stars").mean().round()

        df.to_csv("./yelpdata/YelpReviewCleansed/*.csv", index=False, sep=',', encoding='utf-8')

        # Kept on the instance so the print_* helpers can reuse them.
        self.by_decade = by_decade
        self.by_star = by_star
        print(by_decade.compute())
        print(by_star.compute())

    def print_by_decade(self):
        """Print the per-decade aggregate; only valid after run()."""
        print(self.by_decade.compute())

    def print_by_star(self):
        """Print the per-star aggregate; only valid after run()."""
        # Fixed: this used to print by_decade instead of by_star.
        print(self.by_star.compute())
示例#26
0
class similarity_calc(Task):
    """
    Rank slide pages against each other with a BM25 ranker and store, for
    every page, up to 10 related pages with their normalized scores.
    """
    LOCAL_ROOT = r'static/ranking_results'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(extract_text)

    output = TargetOutput(
        file_pattern=os.path.join(LOCAL_ROOT, '{task.course_name}') + r'/',
        ext='',
        target_class=CSVTarget)

    def run(self):
        """Build the index, score every page as a query and write the table as CSV."""
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start calculating similarity--------------")

        # Create the per-course config next to the corpus on first use.
        cfg_course_path = os.path.join(self.data.output().path, 'config.toml')
        if not os.path.exists(cfg_course_path):
            cfg_course = toml.load('config.toml')
            course_suffix = '_' + self.course_name
            cfg_course['dataset'] += course_suffix
            cfg_course['index'] += course_suffix
            with atomic_write(cfg_course_path, 'w') as f:
                f.write(toml.dumps(cfg_course))

        idx = metapy.index.make_inverted_index(cfg_course_path)
        ranker = metapy.index.OkapiBM25()
        top_k = 10
        query = metapy.index.Document()

        labels_path = os.path.join(
            self.data.output().path,
            '{}.dat.labels'.format(self.data.output().path))
        with open(labels_path, 'r') as fn:
            label_list = fn.read().splitlines()

        texts_path = os.path.join(
            self.data.output().path,
            '{}.dat'.format(self.data.output().path))
        with open(texts_path, 'r') as fn:
            txt_list = fn.read().splitlines()

        # Column 0 holds the page label, followed by (label, score) pairs.
        out = pd.DataFrame(columns=range(21))
        for i, page_label in enumerate(label_list):
            if i % 10 == 0:
                logger.info('processing---{}/{}'.format(
                    str(i), len(label_list)))
            row = [page_label]
            query.content(txt_list[i])
            # One extra hit is requested because the query page itself is
            # filtered out of the results below.
            result = ranker.score(idx, query, top_k + 1)
            if len(result) > 1:
                best_score = result[0][1]
                related = [hit for hit in result if hit[0] != i][:top_k]
                for hit in related:
                    row.append(label_list[hit[0]])
                    # Scores are expressed relative to the best hit.
                    row.append(hit[1] / best_score)
            out.loc[i, range(len(row))] = row

        out_dd = dd.from_pandas(out, npartitions=1)
        self.output().write_dask(out_dd, header=False, index=False)

        logger.info("-----------Finish calculating similarity--------------")
示例#27
0
class GetDeliveryTimes(Task):
    """Scrape the available delivery times from the browser page into a parquet dataset."""

    requires = Requires()
    main_page = Requirement(InfoModalDeliveryTimes)

    # get_time_window() makes sure we don't run this more than once every 5 minutes
    # NOTE(review): it is evaluated once, when the class body runs, so a
    # long-lived process keeps the window computed at import time.
    output = TargetOutput(
        file_pattern=os.path.join(
            "data", "delivery_times_{}".format(get_time_window()), ""),
        ext=".parquet",
        target_class=ParquetTarget,
    )

    def run(self):
        """Write the parsed delivery times, or an empty frame when none are offered."""
        # Commented out because it significantly increases run time
        # self.detect_load_more_times_button()
        if self.detect_no_deliveries():
            self.output().write_dask(
                dd.from_pandas(pd.DataFrame([]), chunksize=1))
        else:
            self.output().write_dask(
                dd.from_pandas(self.detect_delivery_times(), chunksize=1))

    def detect_delivery_times(self):
        """Locate the "Available Scheduled Times" section and parse its text.

        Returns:
            pandas.DataFrame: one row per delivery time found in the section.
        """
        header = find_by_text(
            get_browser(merchant=MERCHANT_NAME),
            "Available Scheduled Times",
        )[0]
        # The text of interest sits two levels above the header element.
        section = get_parent(get_parent(header))
        return pd.DataFrame(self.parse_delivery_times(section.text))

    def parse_delivery_times(self, text):
        """Turn the section text into a list of delivery-time dicts.

        Each time line yields ``{"date": ..., "time": ...}`` using the most
        recent date line (``None`` if no date has been seen yet); a money
        line sets ``"price"`` on the latest entry.

        Args:
            text: multi-line text of the delivery-times section.

        Returns:
            list: the parsed entries, in order of appearance.
        """
        delivery_times = []
        # Initialised up front so a time line that precedes any date line
        # no longer raises UnboundLocalError.
        date = None
        for line in text.splitlines():
            if is_date(line):
                date = line
            if is_time(line):
                delivery_times.append({"date": date, "time": line})
            # A price with no preceding time has nothing to attach to.
            if is_money(line) and delivery_times:
                delivery_times[-1]["price"] = line
        return delivery_times

    def detect_load_more_times_button(self):
        """Click the "More times" button repeatedly until it is no longer found."""
        while True:
            try:
                button = get_browser(
                    merchant=MERCHANT_NAME).find_element_by_xpath(
                        '//button[text()="More times"]')
                button.click()
                sleep(5)
            except NoSuchElementException:
                return

    def detect_no_deliveries(self):
        """Return True when the "All delivery windows are full" image is on the page."""
        try:
            get_browser(merchant=MERCHANT_NAME).find_element_by_css_selector(
                'img[alt="All delivery windows are full"]')
            return True
        except Exception:
            # Still best-effort: any lookup failure means "not detected",
            # but KeyboardInterrupt/SystemExit are no longer swallowed as
            # they were by the bare `except:`.
            return False

    def print_results(self):
        """Print the stored delivery times."""
        print(self.get_results())

    def get_results(self):
        """Read the output target back and return it as a computed frame."""
        return self.output().read_dask().compute()