class ConvNeuralCluster(ExternalProgramTask):
    """Luigi task that retrieves the cluster-trained model via scp (sshpass)."""

    __version__ = "1.0"
    data_source = Parameter()
    output_pred = Parameter()
    output_model = Parameter()
    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    path = os.path.join(
        LOCAL_ROOT, "{task.__class__.__name__}-{salt}/{task.output_model}")
    output = TargetOutput(file_pattern=path,
                          target_class=SuffixPreservingLocalTarget,
                          ext="")

    def program_args(self):
        # Cluster credentials and remote path come from the environment.
        cluster_path = os.getenv("CLUSTER_PATH")
        cluster_pass = os.getenv("CLUSTER_PASS")
        return [
            "sshpass",
            "-p",
            cluster_pass,
            "scp",
            cluster_path,
            self.temp_output_path,
        ]

    def run(self):
        # Write through a temporary path so the output appears atomically.
        with self.output().temporary_path() as self.temp_output_path:
            super().run()

class MockTargetOutputTask(ExternalTask):
    output = TargetOutput(
        target_class=CSVTarget,
        file_pattern=tmp + "/{task.__class__.__name__}",
        ext=".csv",
        glob="*.csv",
    )

class PlotResults(Task):
    """Luigi task that uses the predictions and saves plots"""

    __version__ = "1.0"
    data_source = Parameter()
    output_pred = Parameter()
    output_model = Parameter()
    train_loc = Parameter()
    requires = Requires()
    req_1 = Requirement(ConvNeuralTest)
    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    path = os.path.join(LOCAL_ROOT, "{task.__class__.__name__}-{salt}.png")
    output = TargetOutput(file_pattern=path,
                          target_class=SuffixPreservingLocalTarget,
                          ext="")

    def run(self):
        """Load the saved prediction arrays and call the show_cam plotting method."""
        features = np.load(self.req_1.output().path)
        # Slice off the trailing "features.npy"; str.rstrip strips a character
        # set rather than a suffix and can over-trim the path.
        test_path = self.req_1.output().path[:-len("features.npy")]
        results = np.load(test_path + "results.npy")
        gap_weights_l = np.load(test_path + "gap_weights_l.npy", allow_pickle=True)
        test_image = np.load(test_path + "image.npy")
        show_cam(gap_weights_l, results, features, test_image, self.output().path)

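# A hedged example of kicking off the plotting pipeline above; the parameter
# values are hypothetical placeholders, not values from the original project.
import luigi

luigi.build(
    [PlotResults(data_source="data/OCT",
                 output_pred="features",
                 output_model="model.h5",
                 train_loc="local")],
    local_scheduler=True,
)
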
class download_slides(Task):
    """
    Download raw slides from S3 to local.
    """

    LOCAL_ROOT = 'raw_slides'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(raw_slides)
    output = TargetOutput(
        file_pattern=os.path.join(LOCAL_ROOT, '{task.course_name}'), ext='')

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start downloading pdf's--------------")
        if isinstance(self.data.output(), S3Target):
            os.mkdir(self.output().path)
            for file in self.data.output().fs.list(self.data.output().path):
                if file:
                    self.data.output().fs.get(
                        s3_path=self.data.output().path + r'/' + file,
                        destination_local_path=os.path.join(
                            self.output().path, file))
        else:
            copytree(self.data.output().path, self.output().path)
        logger.info("-----------Finish downloading pdf's--------------")

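# Tasks above and below wire their dependencies through Requires/Requirement
# descriptors. A minimal sketch of how such descriptors can work, assuming
# csci-utils-style semantics (this is not the library's actual source):
class Requirement:
    def __init__(self, task_class, **params):
        self.task_class = task_class
        self.params = params

    def __get__(self, task, cls):
        if task is None:
            return self
        # Clone the dependency so shared parameters (e.g. course_name) propagate.
        return task.clone(self.task_class, **self.params)


class Requires:
    """Composes Requirement descriptors into luigi's requires() protocol."""

    def __get__(self, task, cls):
        if task is None:
            return self
        return lambda: self(task)

    def __call__(self, task):
        # Collect every Requirement declared on the task class into a dict
        # keyed by attribute name, so self.input()['data'] resolves. A full
        # implementation would also walk the MRO for inherited requirements.
        return {
            name: getattr(task, name)
            for name, attr in type(task).__dict__.items()
            if isinstance(attr, Requirement)
        }
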
class parse_pdf_to_single_page(Task):
    """
    Parse the downloaded slide decks into single-page PDF files.
    """

    LOCAL_ROOT = r'pdf.js/static/slides'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(download_slides)
    output = TargetOutput(
        file_pattern=os.path.join(LOCAL_ROOT, '{task.course_name}'), ext='')

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start parsing pdf's--------------")
        os.mkdir(self.output().path)
        raw_pdf_file = sorted(
            glob.glob(os.path.join(self.data.output().path, '*.pdf')))
        for fn in raw_pdf_file:
            logger.info(fn)
            folder_name = os.path.join(
                self.output().path, os.path.basename(fn).replace('.pdf', ''))
            os.mkdir(folder_name)
            # Keep the handle open while pages are read out of the source PDF.
            with open(fn, "rb") as source:
                inputpdf = PdfFileReader(source)
                for i in range(inputpdf.numPages):
                    output = PdfFileWriter()
                    output.addPage(inputpdf.getPage(i))
                    file_name = os.path.join(
                        folder_name,
                        "{course_name}----{folder}----slide{slide_num}.pdf".format(
                            course_name=self.course_name,
                            folder=os.path.basename(fn).replace('.pdf', ''),
                            slide_num=str(i)))
                    with open(file_name, "wb") as outputStream:
                        output.write(outputStream)
        logger.info("-----------Finish parsing pdf's--------------")

class GenerateFaqJsonFromHtml(BaseGenerateTask):
    requires = Requires()
    downloadhtmltemplate = Requirement(DownloadHTMLTemplate)
    output = TargetOutput(file_pattern="data/cdcfaq.json",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.downloadhtmltemplate.output().open('r') as inf:
            parser = etree.HTMLParser()
            tree = etree.parse(inf, parser)
            # Each FAQ card has a heading span; walk up to its enclosing card.
            r = tree.xpath('//span[@role="heading"]/../../../..')
            lst = []
            for v in r:
                hdr = v.xpath('div/div/button/span[@role="heading"]/text()')
                bdy = v.xpath('div/div/div[@class="card-body"]/p/text()')
                q = " ".join(hdr)
                if len(bdy) == 0:
                    bdy = v.xpath('div/div/div[@class="card-body"]/ul/li/text()')
                tmpAns = " ".join(bdy)
                # Truncate overly long answers.
                a = tmpAns[:970] if len(tmpAns) > 1000 else tmpAns
                lst.append({"q": q, "a": a})
            lst = lst[0:self.number]
        with self.output().open('w') as outf:
            json.dump(lst, outf, indent=2)

class GenerateExcel(BaseGenerateTask):
    requires = Requires()
    generatejson = Requirement(GenerateFaqJsonFromHtml)
    output = TargetOutput(file_pattern="data/cdcfaq.csv",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.generatejson.output().open('r') as inf:
            data = []
            qas = json.load(inf)
            for qa in qas:
                answer = qa["a"]
                q = qa["q"].replace(",", " ")
                if not answer.strip() or not q.strip():
                    continue
                data.append({"question": q, "answer": answer})
            df = pd.DataFrame(data)
        with self.output().open('w') as outf:
            # The target is a plain .csv and pandas ignores compression when
            # given an open file handle, so write uncompressed.
            df.to_csv(outf, index=False)

class LocalImageReduced(Task):
    """Luigi task that produces a target for a small subset of the train data"""

    __version__ = "1.0"
    requires = Requires()
    req_1 = Requirement(LocalImage)
    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    LOCAL_IMAGE = os.path.join(LOCAL_ROOT, "OCTReduced")
    output = TargetOutput(
        file_pattern=LOCAL_IMAGE, target_class=SuffixPreservingLocalTarget, ext=""
    )

    def run(self):
        """
        Walk the train/test directories and their per-class subdirectories,
        copying a small sample of images into a new directory tree.
        """
        rootdir = self.req_1.output().path
        newpath = self.output().path
        for src_dir, dirs, files in os.walk(rootdir):
            dst_dir = src_dir.replace(rootdir, newpath, 1)
            if not os.path.exists(dst_dir):
                os.makedirs(dst_dir)
            counter = 0
            for file_ in files:
                src_file = os.path.join(src_dir, file_)
                shutil.copy(src_file, dst_dir)
                counter += 1
                if "train" in dst_dir and counter > 50:
                    break
                elif "test" in dst_dir and counter > 10:
                    break

class MockTargetOutputTask(ExternalTask):
    output = TargetOutput(
        target_class=CSVTarget,
        file_pattern=tmp,
        ext="",
        flag=None,
        glob="*.csv",
    )

class raw_slides(ExternalTask):
    """
    External task pointing at the raw slides saved on S3.
    """

    SLIDES_ROOT = r's3://cznybj/raw_slides'
    course_name = Parameter('CSCI-E29')
    output = TargetOutput(file_pattern=SLIDES_ROOT + r'/{task.course_name}',
                          ext='',
                          target_class=S3Target)

class MockDownloadCSV(DownloadCSV):
    file_url = filepath
    target_path = file_target_path
    output = TargetOutput(
        file_pattern=file_target_path,
        ext="",
        target_class=CSVTarget,
        flag=False,
    )

class ContentHtml(BaseContent):
    CONTENT_CDC_HTML = "cdcfaq.htm"
    output = TargetOutput(
        file_pattern="s3://covid-bot-jt/cdcfaq.htm",
        ext="",
        target_class=S3Target,
        format=format.Nop,
    )

class ContentBotTemplate(BaseContent):
    CONTENT_CDC_HTML = "Covidbot_template.json"
    output = TargetOutput(
        file_pattern="s3://covid-bot-jt/Covidbot_template.json",
        ext="",
        target_class=S3Target,
        format=format.Nop,
    )

class SomeTask(Task):
    output = TargetOutput(file_pattern=outputfilename, ext=".csv")

    def run(self):
        # Resolve the descriptor against this instance and verify the
        # target path it produces.
        print(self.output())
        if self.output().path == outputfilename + ".csv":
            return True
        else:
            raise Exception("unexpected output path")

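# SomeTask above sanity-checks the TargetOutput descriptor used throughout
# this listing. A minimal sketch of such a descriptor, assuming
# csci-utils-style semantics (the salting details are an assumption, not the
# library's actual source):
from hashlib import sha256

from luigi import LocalTarget


def get_salted_version(task):
    # Hypothetical salt: hash the task family plus its significant parameters.
    msg = task.get_task_family() + str(task.to_str_params(only_significant=True))
    return sha256(msg.encode()).hexdigest()[:6]


class TargetOutput:
    def __init__(self, file_pattern="{task.task_id}", ext=".txt",
                 target_class=LocalTarget, **target_kwargs):
        self.file_pattern = file_pattern
        self.ext = ext
        self.target_class = target_class
        self.target_kwargs = target_kwargs

    def __get__(self, task, cls):
        if task is None:
            return self
        # Resolve lazily, so "output = TargetOutput(...)" behaves like a method.
        return lambda: self(task)

    def __call__(self, task):
        # "{task...}" and "{salt}" placeholders are filled at access time.
        path = self.file_pattern.format(task=task, salt=get_salted_version(task))
        return self.target_class(path + self.ext, **self.target_kwargs)
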
class YelpReviews(ExternalTask):
    __version__ = "1.0.0"
    DATA_ROOT = "s3://pset5yelpreviews/"

    # if len(sys.argv) > 1 and sys.argv[1] == "--full":
    #     files = "*.csv"
    # else:
    #     files = "yelp_subset_0.csv"
    files = "*.csv"

    output = TargetOutput(
        file_pattern=DATA_ROOT,
        ext=".parquet",
        target_class=CSVTarget,
        flag=None,
        glob=files,
        storage_options=dict(requester_pays=True),
    )

class DownloadHTMLTemplate(BaseContent):
    requires = Requires()
    contenthtmltemplate = Requirement(ContentHtml)
    output = TargetOutput(
        file_pattern="data/cdcfaq.htm",
        ext="",
        target_class=LocalTarget,
        format=format.Nop,
    )

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.contenthtmltemplate.output().open('r') as inf, \
                self.output().open('w') as outf:
            outf.write(inf.read())

class DownloadBotTemplate(BaseContent):
    requires = Requires()
    contentbottemplate = Requirement(ContentBotTemplate)
    output = TargetOutput(
        file_pattern="data/templates/Covidbot_template.json",
        ext="",
        target_class=LocalTarget,
        format=format.Nop,
    )

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.contentbottemplate.output().open('r') as inf, \
                self.output().open('w') as outf:
            outf.write(inf.read())

class ConvNeuralTrain(ConvNeural):
    """Luigi task for training the model"""

    __version__ = "1.0"
    action = "train"

    def requires(self):
        return self.clone(LocalImageReduced)

    path = os.path.join(
        ConvNeural.LOCAL_ROOT,
        "{task.__class__.__name__}-{salt}/{task.output_model}")
    output = TargetOutput(file_pattern=path,
                          target_class=SuffixPreservingLocalTarget,
                          ext="")

class ConvNeuralTest(ConvNeural):
    """Luigi task that activates the ML venv through its parent class and
    runs the model testing."""

    __version__ = "1.0"
    action = "test"
    train_loc = Parameter()

    def requires(self):
        # Test against the cluster-trained model when requested.
        if self.train_loc == "cluster":
            return self.clone(ConvNeuralCluster)
        return self.clone(ConvNeuralTrain)

    path = os.path.join(
        ConvNeural.LOCAL_ROOT,
        "{task.__class__.__name__}-{salt}/{task.output_pred}.npy")
    output = TargetOutput(file_pattern=path,
                          target_class=SuffixPreservingLocalTarget,
                          ext="")

class DownloadCSV(ExternalTask):
    """Luigi task to download the Covid data CSV file from OWID and save the
    result as a dask collection."""

    file_url = "https://covid.ourworldindata.org/data/owid-covid-data.csv"
    parent_directory = Path(os.path.dirname(
        os.path.realpath(__file__))).parent.parent
    target_path = "data/covid_data"
    local_directory = os.path.join(str(parent_directory), "data/covid_data/")
    output = TargetOutput(file_pattern=local_directory,
                          ext="",
                          target_class=CSVTarget,
                          flag=False)

    def run(self):
        """Write a set of CSV files to the data/covid_data folder."""
        data = pd.read_csv(self.file_url)
        ddf = dd.from_pandas(data, chunksize=5000)
        self.output().write_dask(collection=ddf, filename=self.target_path)

class extract_text(Task):
    """
    Extract the text information from each page of slides
    """

    LOCAL_ROOT = r'slides'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(parse_pdf_to_single_page)
    output = TargetOutput(file_pattern=LOCAL_ROOT + '_{task.course_name}', ext='')

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info(
            "-----------Start extracting text from pdf's--------------")
        os.mkdir(self.output().path)
        single_pdf_file = sorted(
            glob.glob(os.path.join(self.data.output().path, r'**/*.pdf')))
        content = []
        label = []
        for fn in single_pdf_file:
            # Log once per deck, when its first page comes around.
            if fn.endswith('slide1.pdf'):
                logger.info(fn.rsplit(r'----', 1)[0])
            text = textract.process(fn)
            text = text.decode("utf-8")
            content.append(tokenizer(text))
            label.append(
                os.path.basename(fn).replace('----', '##').replace('.pdf', '')
                + '\n')
        logger.info(
            "-----------Finish extracting text from pdf's--------------")
        with atomic_write(
                os.path.join(self.output().path,
                             '{}.dat'.format(self.output().path))) as fn:
            fn.writelines(content)
        with atomic_write(
                os.path.join(self.output().path,
                             '{}.dat.labels'.format(self.output().path))) as fn:
            fn.writelines(label)

class CleanedReviews(Task):
    __version__ = "1.0.0"
    subset = BoolParameter(default=True)
    requires = Requires()
    task2 = Requirement(YelpReviews)
    parquet_data = "./yelpdata/"
    output = TargetOutput(file_pattern=parquet_data,
                          ext="",
                          target_class=ParquetTarget)

    def run(self):
        df = self.input()["task2"].read_dask(check_complete=True)
        # Keep only rows with a user id and a well-formed 22-character review id.
        df = df[(df.user_id.notnull()) & (df.review_id.str.len() == 22)]
        values = {"funny": 0, "cool": 0, "useful": 0, "stars": 0}
        df = df.fillna(value=values)
        df = df.astype({"funny": int, "cool": int, "useful": int, "stars": int})
        self.output().write_dask(collection=df, compression="gzip")

class GenerateBot(BaseGenerateTask):
    requires = Requires()
    generatejson = Requirement(GenerateFaqJsonFromHtml)
    downloadbottemplate = Requirement(DownloadBotTemplate)
    COVID_INTENTBASE = "covid"
    output = TargetOutput(file_pattern="data/bot/cdcfaqbot.json",
                          ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.generatejson.output().open('r') as inf:
            qas = json.load(inf)
        with self.downloadbottemplate.output().open('r') as template:
            covidbot_templates = json.load(template)
        covidbotresource = covidbot_templates
        covid_intents = covidbotresource["resource"]["intents"]
        for qa_num, qa in enumerate(qas):
            answer = qa["a"]
            q = qa["q"].replace(",", " ")
            if not answer.strip() or not q.strip():
                continue
            # Build one intent per Q/A pair from the INTENT_JSON template.
            intent_name = f"{self.COVID_INTENTBASE}{num_to_char(qa_num)}"
            lex_template = json.loads(INTENT_JSON)
            resource = lex_template
            resource["name"] = intent_name
            resource["sampleUtterances"] = gen_sample_utterances(q, qa)
            resource["conclusionStatement"]["messages"][0]["content"] = answer
            covid_intents.append(resource)
        with self.output().open('w') as outf:
            json.dump(covidbot_templates, outf, indent=2)

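# GenerateBot derives intent names with a num_to_char helper. A minimal sketch
# of such a helper, assuming it maps an index to a lowercase-letter suffix
# (presumably because Lex intent names cannot contain digits; the real
# implementation may differ):
import string


def num_to_char(n):
    # Base-26 encoding: 0 -> "a", 25 -> "z", 26 -> "ba", ...
    chars = ""
    while True:
        n, rem = divmod(n, 26)
        chars = string.ascii_lowercase[rem] + chars
        if n == 0:
            return chars
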
class CleanedHeadlines(Task):
    '''
    Load the article-headline data from the AWS instance (if it exists) and
    preprocess it for analysis. The task produces a dataframe of preprocessed
    headlines that the Topic Modeling class can load for further analysis.

    :input: S3 path to article headlines
    :output: local parquet file with the preprocessed data
    '''

    subset = BoolParameter(default=True)
    requires = Requires()
    article_headlines = Requirement(ArticleHeadlines)
    date = datetime.datetime.now()
    date_suffix = str(date.month) + '_' + str(date.day) + '_' + str(date.year)
    output = TargetOutput(
        target_class=ParquetTarget,
        ext='-' + date_suffix,
        glob="*.parquet",
    )

    def run(self):
        dsk = self.input()['article_headlines'].read_dask(
            dtype={
                "publish_date": "int32",
                "headline_text": "str",
                "headline_id": "str",
            },
            storage_options=dict(requester_pays=True),
        )
        # dsk_df = dsk.compute()
        # Salted hash of the concatenated headline ids (currently unused,
        # but kept as a fingerprint of the input data).
        headlines_concat = "".join(dsk["headline_id"])
        headlines_hash = hash_str(headlines_concat, get_csci_salt()).hex()[:8]
        self.output().write_dask(dsk, compression="gzip")

    def print_results(self):
        print(self.output().read_dask().compute())

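# CleanedHeadlines salts a hash of the headline ids via hash_str and
# get_csci_salt. A minimal sketch under assumed pset-1-style semantics (the
# environment-variable name and details are assumptions):
import hashlib
import os


def get_csci_salt():
    # Hypothetical: the salt is stored hex-encoded in an environment variable.
    return bytes.fromhex(os.environ["CSCI_SALT"])


def hash_str(some_val, salt=""):
    """Salted sha256 of a string; returns the raw digest (so .hex() works)."""
    if isinstance(salt, str):
        salt = salt.encode()
    if isinstance(some_val, str):
        some_val = some_val.encode()
    return hashlib.sha256(salt + some_val).digest()
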
class BySomething(Task):
    __version__ = "1.0.0"
    requires = Requires()
    task3 = Requirement(CleanedReviews)  # Be sure to read from CleanedReviews locally
    output = TargetOutput(
        file_pattern="./yelpdata/",
        ext=".parquet",
        target_class=ParquetTarget,
        flag=None,
        glob="*.parquet",
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        # read_dask takes "columns"; the code below needs date, stars, and text.
        df = self.input()["task3"].read_dask(
            check_complete=True, columns=["date", "stars", "text"]
        )
        df["decade"] = (df["date"].dt.year // 10) % 10
        df["text_length"] = df["text"].str.len()
        by_decade = df[["decade", "text_length"]].groupby("decade").mean().round()
        by_star = df[["stars", "text_length"]].groupby("stars").mean().round()
        df.to_csv("./yelpdata/YelpReviewCleansed/*.csv",
                  index=False, sep=',', encoding='utf-8')
        self.by_decade = by_decade
        self.by_star = by_star
        print(by_decade.compute())
        print(by_star.compute())

    def print_by_decade(self):
        print(self.by_decade.compute())

    def print_by_star(self):
        print(self.by_star.compute())

class similarity_calc(Task):
    """
    Calculate similarity with the Okapi BM25 model, and return the 10 most
    related pages for each slide.
    """

    LOCAL_ROOT = r'static/ranking_results'
    course_name = Parameter('CSCI-E29')
    requires = Requires()
    data = Requirement(extract_text)
    output = TargetOutput(
        file_pattern=os.path.join(LOCAL_ROOT, '{task.course_name}') + r'/',
        ext='',
        target_class=CSVTarget)

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start calculating similarity--------------")
        # Write a per-course metapy config the first time through.
        cfg_course_path = os.path.join(self.data.output().path, 'config.toml')
        if not os.path.exists(cfg_course_path):
            cfg_course = toml.load('config.toml')
            cfg_course['dataset'] += '_' + self.course_name
            cfg_course['index'] += '_' + self.course_name
            with atomic_write(cfg_course_path, 'w') as f:
                f.write(toml.dumps(cfg_course))
        idx = metapy.index.make_inverted_index(cfg_course_path)
        ranker = metapy.index.OkapiBM25()
        top_k = 10
        query = metapy.index.Document()
        with open(
                os.path.join(self.data.output().path,
                             '{}.dat.labels'.format(self.data.output().path)),
                'r') as fn:
            label_list = fn.read().splitlines()
        with open(
                os.path.join(self.data.output().path,
                             '{}.dat'.format(self.data.output().path)),
                'r') as fn:
            txt_list = fn.read().splitlines()
        # One row per slide: its label, then up to ten
        # (label, normalized score) pairs flattened into 20 columns.
        out = pd.DataFrame(columns=range(21))
        for i in range(len(label_list)):
            if not i % 10:
                logger.info('processing---{}/{}'.format(
                    str(i), len(label_list)))
            row = [label_list[i]]
            query.content(txt_list[i])
            result = ranker.score(idx, query, top_k + 1)
            if len(result) > 1:
                # Drop the query page itself and normalize by the best score.
                top_similarity = result[0]
                result = [res for res in result if res[0] != i][:top_k]
                result_normalize = [(label_list[res[0]],
                                     res[1] / top_similarity[1])
                                    for res in result]
                result_normalize = [
                    item for item_pair in result_normalize
                    for item in item_pair
                ]
                row += result_normalize
            out.loc[i, range(len(row))] = row
        out_dd = dd.from_pandas(out, npartitions=1)
        self.output().write_dask(out_dd, header=False, index=False)
        logger.info("-----------Finish calculating similarity--------------")

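# A hedged end-to-end run of the slides pipeline ending in similarity_calc;
# the module name "slides_pipeline" in the CLI form is an assumption about
# where these tasks live.
#
#   $ luigi --module slides_pipeline similarity_calc \
#         --course-name CSCI-E29 --local-scheduler
#
# Equivalent programmatic form:
import luigi

luigi.build([similarity_calc(course_name='CSCI-E29')], local_scheduler=True)
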
class GetDeliveryTimes(Task):
    requires = Requires()
    main_page = Requirement(InfoModalDeliveryTimes)

    # get_time_window() makes sure we don't run this more than once every 5 minutes
    output = TargetOutput(
        file_pattern=os.path.join(
            "data", "delivery_times_{}".format(get_time_window()), ""),
        ext=".parquet",
        target_class=ParquetTarget,
    )

    def run(self):
        # Commented out because it significantly increases run time
        # self.detect_load_more_times_button()
        if self.detect_no_deliveries():
            self.output().write_dask(
                dd.from_pandas(pd.DataFrame([]), chunksize=1))
        else:
            self.output().write_dask(
                dd.from_pandas(self.detect_delivery_times(), chunksize=1))

    def detect_delivery_times(self):
        header = find_by_text(
            get_browser(merchant=MERCHANT_NAME),
            "Available Scheduled Times",
        )[0]
        section = get_parent(get_parent(header))
        return pd.DataFrame(self.parse_delivery_times(section.text))

    def parse_delivery_times(self, text):
        delivery_times = []
        for line in text.splitlines():
            if is_date(line):
                date = line
            if is_time(line):
                delivery_times.append({"date": date, "time": line})
            if is_money(line):
                delivery_times[-1]["price"] = line
        return delivery_times

    def detect_load_more_times_button(self):
        while True:
            try:
                button = get_browser(
                    merchant=MERCHANT_NAME).find_element_by_xpath(
                        '//button[text()="More times"]')
                button.click()
                sleep(5)
            except NoSuchElementException:
                return

    def detect_no_deliveries(self):
        try:
            get_browser(merchant=MERCHANT_NAME).find_element_by_css_selector(
                'img[alt="All delivery windows are full"]')
            return True
        except NoSuchElementException:
            # Catch the specific exception so real errors still propagate.
            return False

    def print_results(self):
        print(self.get_results())

    def get_results(self):
        return self.output().read_dask().compute()

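# A hedged illustration of what parse_delivery_times produces for invented
# modal text, assuming is_date/is_time/is_money classify the lines as shown:
sample = "Wednesday, Apr 1\n10am - 12pm\n$7.99\n2pm - 4pm\n$9.99"
# GetDeliveryTimes().parse_delivery_times(sample) would then return:
# [{'date': 'Wednesday, Apr 1', 'time': '10am - 12pm', 'price': '$7.99'},
#  {'date': 'Wednesday, Apr 1', 'time': '2pm - 4pm', 'price': '$9.99'}]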