class FetchBuildDetails(BaseBDroneTask):
    requires = Requires()
    builds = Requirement(FetchBuilds)
    timestamp = Parameter()

    @delayed
    def fetch_build_details(self, repo, build_id, repo_id):
        resp_json = self.get_result(f"{repo}/builds/{build_id}")
        df = pd.DataFrame(resp_json, columns=[*columns, 'stages']).fillna('None')
        df['repo_id'] = repo_id
        return ddf.from_pandas(df, npartitions=1)

    def __init__(self, timestamp):
        super(FetchBuildDetails, self).__init__(timestamp)
        self.output_path = os.path.join(get_base_output_path(), timestamp, "details")

    def run(self):
        print("########## Fetching Build Details ############")
        df = self.input()["builds"].read_dask()
        fetches = list(
            df.apply(
                lambda r: self.fetch_build_details(r["repo_name"], r["number"], r["repo_id"]),
                axis=1,
            ).compute()
        )
        dfs = (
            ddf.concat([*compute(*fetches)], axis=0)
            .set_index('number')
            .map_partitions(lambda x: x.sort_index())
        )
        self.output().write_dask(dfs)

    def output(self):
        return CSVTarget(path=self.output_path + os.path.sep, glob="*.part")
class download_slides(Task):
    """
    Download raw slides from S3 to local.
    """
    LOCAL_ROOT = 'raw_slides'
    course_name = Parameter('CSCI-E29')

    requires = Requires()
    data = Requirement(raw_slides)
    output = TargetOutput(
        file_pattern=os.path.join(LOCAL_ROOT, '{task.course_name}'), ext='')

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start downloading pdf's--------------")
        if isinstance(self.data.output(), S3Target):
            os.mkdir(self.output().path)
            for file in self.data.output().fs.list(self.data.output().path):
                if file:
                    self.data.output().fs.get(
                        s3_path=self.data.output().path + r'/' + file,
                        destination_local_path=os.path.join(self.output().path, file))
        else:
            copytree(self.data.output().path, self.output().path)
        logger.info("-----------Finish downloading pdf's--------------")
class parse_pdf_to_single_page(Task):
    """
    Parse the downloaded slides to single page slide files.
    """
    LOCAL_ROOT = r'pdf.js/static/slides'
    course_name = Parameter('CSCI-E29')

    requires = Requires()
    data = Requirement(download_slides)
    output = TargetOutput(
        file_pattern=os.path.join(LOCAL_ROOT, '{task.course_name}'), ext='')

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start parsing pdf's--------------")
        os.mkdir(self.output().path)
        raw_pdf_file = sorted(glob.glob(os.path.join(self.data.output().path, '*.pdf')))
        for fn in raw_pdf_file:
            logger.info(fn)
            inputpdf = PdfFileReader(open(fn, "rb"))
            folder_name = os.path.join(
                self.output().path, os.path.basename(fn).replace('.pdf', ''))
            os.mkdir(folder_name)
            for i in range(inputpdf.numPages):
                output = PdfFileWriter()
                output.addPage(inputpdf.getPage(i))
                file_name = os.path.join(
                    folder_name,
                    "{course_name}----{folder}----slide{slide_num}.pdf".format(
                        course_name=self.course_name,
                        folder=os.path.basename(fn).replace('.pdf', ''),
                        slide_num=str(i)))
                with open(file_name, "wb") as outputStream:
                    output.write(outputStream)
        logger.info("-----------Finish parsing pdf's--------------")
class GenerateFaqJsonFromHtml(BaseGenerateTask):
    requires = Requires()
    downloadtmltemplate = Requirement(DownloadHTMLTemplate)
    output = TargetOutput(file_pattern="data/cdcfaq.json", ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.downloadtmltemplate.output().open('r') as inf:
            parser = etree.HTMLParser()
            tree = etree.parse(inf, parser)
            r = tree.xpath('//span[@role="heading"]/../../../..')
            lst = []
            for v in r:
                hdr = v.xpath('div/div/button/span[@role="heading"]/text()')
                bdy = v.xpath('div/div/div[@class="card-body"]/p/text()')
                q = " ".join(hdr)
                if len(bdy) == 0:
                    bdy = v.xpath('div/div/div[@class="card-body"]/ul/li/text()')
                tmpAns = " ".join(bdy)
                a = tmpAns[:970] if (len(tmpAns) > 1000) else tmpAns
                lst.append({"q": q, "a": a})
            lst = lst[0:self.number]
            with self.output().open('w') as outf:
                json.dump(lst, outf, indent=2)
class GenerateExcel(BaseGenerateTask):
    requires = Requires()
    geneeratejson = Requirement(GenerateFaqJsonFromHtml)
    output = TargetOutput(file_pattern="data/cdcfaq.csv", ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.geneeratejson.output().open('r') as inf:
            data = []
            qas = json.load(inf)
            for qa_num, qa in enumerate(qas):
                answer = qa["a"]
                q = qa["q"]
                q = q.replace(",", " ")
                if not answer.strip():
                    continue
                if not q.strip():
                    continue
                data.append({"question": q, "answer": answer})
            df = pd.DataFrame(data)
            with self.output().open('w') as outf:
                # Write plain CSV: the target handle is opened in text mode,
                # so gzip compression would not round-trip through it.
                df.to_csv(outf, index=False)
class PlotResults(Task):
    """Luigi task that uses prediction and saves plots"""

    __version__ = "1.0"

    data_source = Parameter()
    output_pred = Parameter()
    output_model = Parameter()
    train_loc = Parameter()

    requires = Requires()
    req_1 = Requirement(ConvNeuralTest)

    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    path = os.path.join(LOCAL_ROOT, "{task.__class__.__name__}-{salt}.png")
    output = TargetOutput(file_pattern=path,
                          target_class=SuffixPreservingLocalTarget, ext="")

    def run(self):
        """
        Function that loads the prediction and calls the show_cam plotting method
        """
        features = np.load(self.req_1.output().path)
        # Drop the "features.npy" suffix to recover the directory prefix
        # (str.rstrip strips a character set, not a suffix).
        test_path = self.req_1.output().path[:-len("features.npy")]
        results = np.load(test_path + "results.npy")
        gap_weights_l = np.load(test_path + "gap_weights_l.npy", allow_pickle=True)
        test_image = np.load(test_path + "image.npy")
        show_cam(gap_weights_l, results, features, test_image, self.output().path)
class LocalImageReduced(Task):
    """Luigi external task that returns a target for a small subset of train data"""

    __version__ = "1.0"

    requires = Requires()
    req_1 = Requirement(LocalImage)

    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    LOCAL_IMAGE = os.path.join(LOCAL_ROOT, "OCTReduced")
    output = TargetOutput(
        file_pattern=LOCAL_IMAGE, target_class=SuffixPreservingLocalTarget, ext="")

    def run(self):
        """
        This function goes through the train/test directories and the
        subdirectories inside for each class and takes a small sample of
        images and copies them into a new directory.
        """
        rootdir = self.req_1.output().path
        newpath = self.output().path
        for src_dir, dirs, files in os.walk(rootdir):
            dst_dir = src_dir.replace(rootdir, newpath, 1)
            if not os.path.exists(dst_dir):
                os.makedirs(dst_dir)
            counter = 0
            for file_ in files:
                src_file = os.path.join(src_dir, file_)
                shutil.copy(src_file, dst_dir)
                counter += 1
                if "train" in dst_dir and counter > 50:
                    break
                elif "test" in dst_dir and counter > 10:
                    break
class MyTask(Task):
    requires = Requires()
    other = Requirement(OtherTask)

    def run(self):
        # Calling the Requires() descriptor returns a dict of named requirements.
        z = MyTask.requires.__call__(self)
        requirement_task = z.get("other")
        assert type(requirement_task) is OtherTask
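# The snippet above relies on the Requires()/Requirement() descriptor pattern used
# by every task in these listings: resolving `requires` on a task instance yields a
# callable that returns a dict mapping each Requirement attribute name (here
# "other") to an instantiated upstream task. A minimal sketch of a descriptor pair
# with that behaviour follows. It is an illustrative assumption, not the project's
# actual implementation, and is meant to live in a scratch module so it does not
# shadow the imported Requires/Requirement names.

from functools import partial


class Requirement:
    def __init__(self, task_class, **params):
        self.task_class = task_class
        self.params = params

    def __get__(self, task, cls):
        if task is None:
            return self
        # Clone the owning task so matching parameters carry over to the requirement.
        return task.clone(self.task_class, **self.params)


class Requires:
    def __get__(self, task, cls):
        if task is None:
            return self
        return partial(self.__call__, task)

    def __call__(self, task):
        # Collect every Requirement declared directly on the task's class.
        return {
            name: getattr(task, name)
            for name, attr in vars(type(task)).items()
            if isinstance(attr, Requirement)
        }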
class StoreFront(Task):
    requires = Requires()
    logged_in = Requirement(LoggedIn)
    output = StoreFrontTarget(merchant=MERCHANT_NAME)

    def run(self):
        get_browser(merchant=MERCHANT_NAME).get(
            "https://www.instacart.com/store/wegmans/storefront")
        sleep(14)
class FetchBuilds(BaseBDroneTask):
    requires = Requires()
    date_range = Requirement(DateRange)
    timestamp = Parameter()

    def __init__(self, timestamp):
        super(FetchBuilds, self).__init__(timestamp)
        self.output_path = os.path.join(get_base_output_path(), timestamp, "builds")

    @delayed
    def fetch_data(self, repo, repo_id, start_date):
        page_no = 1
        dfs = []
        while True:
            payload = {"page": page_no, "branch": "main"}
            resp_json = self.get_result(f"{repo}/builds", payload)
            if len(resp_json) > 0:
                pd_df = pd.DataFrame(resp_json, columns=columns).fillna('None')
                dfs.append(ddf.from_pandas(pd_df, npartitions=1))
                # Stop paging once the oldest build on this page predates the
                # requested start date (builds come back newest first).
                if resp_json[-1]["started"] < int(start_date.strftime('%s')):
                    break
                page_no += 1
            else:
                break
        df = ddf.concat(dfs)
        df["repo_id"] = repo_id
        df["repo_name"] = repo
        return df

    def run(self):
        print("########## Fetching Builds ############")
        date_range = self.input()["date_range"].get_date_range()
        self.input()["date_range"].mark_fetching()
        targets = [
            self.fetch_data(dr["repo"], dr["repo_id"], dr["start_date"])
            for dr in date_range
        ]
        dfs = ddf.concat([*compute(*targets)], axis=0)
        self.output().write_dask(dfs)

    def output(self):
        return CSVTarget(path=self.output_path + os.path.sep, glob="*.part")
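# FetchBuilds and FetchBuildDetails lean on dask's delayed/compute pattern: each
# @delayed call returns a lazy token, and a single compute(*tokens) call
# materializes all of them before they are concatenated and written out. A
# minimal, self-contained sketch of that pattern, with made-up rows standing in
# for the Drone API responses:

import pandas as pd
import dask.dataframe as ddf
from dask import compute, delayed


@delayed
def fetch_one(repo, rows):
    # Pretend `rows` came back from an HTTP call for this repo.
    pdf = pd.DataFrame(rows)
    pdf["repo_name"] = repo
    return ddf.from_pandas(pdf, npartitions=1)


tokens = [
    fetch_one("org/app", [{"number": 1, "status": "success"}]),
    fetch_one("org/lib", [{"number": 7, "status": "failure"}]),
]
# compute() resolves every delayed token; the results are single-partition
# dask frames that can then be concatenated exactly as the tasks above do.
combined = ddf.concat([*compute(*tokens)], axis=0)
print(combined.compute())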
class LoginPage(Task):
    requires = Requires()
    browser_open = Requirement(BrowserOpen)
    output = LoginPageTarget(merchant=MERCHANT_NAME)

    def run(self):
        buttons = get_browser(
            merchant=MERCHANT_NAME).find_elements_by_css_selector("button")
        login_button = buttons[0]
        login_button.click()
        sleep(5)
class InfoModalDeliveryTimes(Task):
    requires = Requires()
    info_modal = Requirement(InfoModal)
    output = InfoModalDeliveryTimesTarget(merchant=MERCHANT_NAME)

    def run(self):
        find_by_text(
            get_browser(merchant=MERCHANT_NAME),
            "Delivery times",
        )[0].click()
        sleep(8)
class InfoModal(Task):
    requires = Requires()
    main_page = Requirement(StoreFront)
    trial_prompt_closed = Requirement(CloseTrialPrompt)
    output = InfoModalTarget(merchant=MERCHANT_NAME)

    def run(self):
        cart_button = get_browser(
            merchant=MERCHANT_NAME).find_element_by_css_selector(
                'a[href="/wegmans/info?tab=info"]')
        cart_button.click()
        sleep(7)
class CloseTrialPrompt(Task):
    requires = Requires()
    store_front = Requirement(StoreFront)
    output = TrialPromptClosedTarget(merchant=MERCHANT_NAME)

    def run(self):
        buttons = find_by_text(
            get_browser(merchant=MERCHANT_NAME),
            "Got it, Thanks",
        )
        if len(buttons) > 0:
            buttons[0].click()
        sleep(2)
class StoreBuilds(Task):
    requires = Requires()
    details = Requirement(FetchBuildDetails)
    timestamp = Parameter()

    def run(self):
        df = self.input()['details'].read_dask()
        # set_index returns a new frame; keep the result so the sort applies.
        df = df.set_index('number')
        df = df.map_partitions(lambda x: x.sort_index())
        self.output().insert_data(df)
        self.output().mark_fetched()

    def output(self):
        return StoreBuildsTarget()
class DownloadHTMLTemplate(BaseContent):
    requires = Requires()
    contenthtmltemplate = Requirement(ContentHtml)
    output = TargetOutput(
        file_pattern="data/cdcfaq.htm",
        ext="",
        target_class=LocalTarget,
        format=format.Nop,
    )

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.contenthtmltemplate.output().open('r') as inf, \
                self.output().open('w') as outf:
            outf.write(inf.read())
class DownloadBotTemplate(BaseContent):
    requires = Requires()
    contentbottemplate = Requirement(ContentBotTemplate)
    output = TargetOutput(
        file_pattern="data/templates/Covidbot_template.json",
        ext="",
        target_class=LocalTarget,
        format=format.Nop,
    )

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.contentbottemplate.output().open('r') as inf, \
                self.output().open('w') as outf:
            outf.write(inf.read())
class CountryDimension(Task):
    """Luigi Task to create the country dimension table for lookups."""

    requires = Requires()
    raw = Requirement(DownloadCSV)

    parent_directory = Path(os.path.dirname(os.path.realpath(__file__))).parent.parent
    target_filename = os.path.join(str(parent_directory), "data/country_dimension.csv")

    def output(self):
        """Specifies the LocalTarget output for the task."""
        return LocalTarget(self.target_filename)

    def run(self):
        """Writes the country level statistics to data/country_dimension.csv"""
        raw_data = self.input()["raw"].read_dask(filename="*.part").compute()
        country_dimension = (
            raw_data.groupby(["location"])
            .agg({
                "median_age": "max",
                "aged_65_older": "max",
                "aged_70_older": "max",
                "gdp_per_capita": "max",
                "cardiovasc_death_rate": "max",
                "diabetes_prevalence": "max",
                "female_smokers": "max",
                "male_smokers": "max",
                "handwashing_facilities": "max",
                "life_expectancy": "max",
                "human_development_index": "max",
            })
            .reset_index())
        country_dimension.to_csv(self.target_filename, index=False)
class LatestWeeklyData(Task):
    """Luigi Task which identifies the latest weekly snapshot for each country."""

    requires = Requires()
    other = Requirement(AggregateWeeklyData)

    parent_directory = Path(os.path.dirname(os.path.realpath(__file__))).parent.parent
    target_filename = os.path.join(str(parent_directory), "data/latest_data.csv")

    def output(self):
        """Specifies the LocalTarget output for the task."""
        return LocalTarget(self.target_filename)

    def run(self):
        """Identifies the latest summary statistic by country and writes the
        results to data/latest_data.csv"""
        with self.input()["other"].open("r") as f:
            data = f.readlines()
        rows = [ele.strip().split(",") for ele in data]
        column_names = rows.pop(0)
        weekly_data = pd.DataFrame(rows, columns=column_names)
        datatypes = {
            "week": str,
            "new_cases": np.float64,
            "stringency_index": np.float64,
            "total_deaths": np.float64,
            "population": np.float64,
            "total_tests": np.float64,
        }
        weekly_data = weekly_data.astype(datatypes)
        max_week = weekly_data.groupby("location").agg({"week": "max"}).reset_index()
        latest_data = weekly_data.merge(max_week)
        latest_data["death_per_population_pct"] = (
            latest_data["total_deaths"] * 100 / latest_data["population"])
        latest_data["tests_per_population"] = (
            latest_data["total_tests"] / latest_data["population"])
        latest_data.to_csv(self.target_filename, index=False)
class extract_text(Task):
    """
    Extract the text information from each page of slides
    """
    LOCAL_ROOT = r'slides'
    course_name = Parameter('CSCI-E29')

    requires = Requires()
    data = Requirement(parse_pdf_to_single_page)
    output = TargetOutput(file_pattern=LOCAL_ROOT + '_{task.course_name}', ext='')

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start extracting text from pdf's--------------")
        os.mkdir(self.output().path)
        single_pdf_file = sorted(
            glob.glob(os.path.join(self.data.output().path, r'**/*.pdf')))
        content = []
        label = []
        for fn in single_pdf_file:
            if fn.endswith('slide1.pdf'):
                # Log once per deck as a progress indicator.
                logger.info(fn.rsplit(r'----', 1)[0])
            text = textract.process(fn)
            text = text.decode("utf-8")
            content.append(tokenizer(text))
            label.append(
                os.path.basename(fn).replace('----', '##').replace('.pdf', '') + '\n')
        logger.info("-----------Finish extracting text from pdf's--------------")
        with atomic_write(
                os.path.join(self.output().path,
                             '{}.dat'.format(self.output().path))) as fn:
            fn.writelines(content)
        with atomic_write(
                os.path.join(self.output().path,
                             '{}.dat.labels'.format(self.output().path))) as fn:
            fn.writelines(label)
class CleanedReviews(Task):
    __version__ = "1.0.0"
    subset = BoolParameter(default=True)

    requires = Requires()
    task2 = Requirement(YelpReviews)

    parquet_data = "./yelpdata/"
    output = TargetOutput(file_pattern=parquet_data, ext="",
                          target_class=ParquetTarget)

    def run(self):
        df = self.input()["task2"].read_dask(check_complete=True)
        df = df[(df.user_id.notnull()) & (df.review_id.str.len() == 22)]
        values = {"funny": 0, "cool": 0, "useful": 0, "stars": 0}
        df = df.fillna(value=values)
        df = df.astype({"funny": int, "cool": int, "useful": int, "stars": int})
        self.output().write_dask(collection=df, compression="gzip")
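# A task such as CleanedReviews is typically kicked off programmatically with
# luigi.build (or via `luigi --module <module> CleanedReviews` on the command
# line). The snippet below is only a usage sketch: the module name yelp_tasks is
# assumed for illustration, and it presumes the upstream YelpReviews data is
# reachable.

import luigi

from yelp_tasks import CleanedReviews  # hypothetical module name

if __name__ == "__main__":
    # local_scheduler avoids needing a running luigid instance.
    luigi.build([CleanedReviews(subset=True)], local_scheduler=True)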
class ConvNeural(ExternalPythonProgramTask):
    """
    Luigi Task to run a shell script that builds and activates a new venv
    """
    __version__ = "1.0"

    data_source = Parameter()
    output_pred = Parameter()
    output_model = Parameter()

    requires = Requires()

    LOCAL_ROOT = os.path.join(os.getcwd(), "data")
    LOCAL_IMAGE = os.path.join(LOCAL_ROOT, "OCTReduced")

    extra_pythonpath = os.getcwd()
    virtualenv = os.path.join(extra_pythonpath, "final_project/tasks/ML_env/.venv")

    def program_args(self):
        """Executes a shell script where a new venv is created for the ML
        environment and the train or test code is run."""
        data_path = os.path.join(self.LOCAL_ROOT, self.data_source)
        ml_path = "final_project/tasks/ML_env"
        model_path = self.temp_output_path
        if self.__class__.__name__ == "ConvNeuralTest":
            model_path = self.input().path
        return [
            "./external_script.sh",
            data_path,
            self.temp_output_path,
            ml_path,
            self.virtualenv,
            self.action,
            model_path,
        ]

    def run(self):
        with self.output().temporary_path() as self.temp_output_path:
            super().run()
class LoggedIn(Task):
    requires = Requires()
    login_page = Requirement(LoginPage)
    output = LoggedInTarget(merchant=MERCHANT_NAME)

    def run(self):
        browser = get_browser(merchant=MERCHANT_NAME)
        email_form = browser.find_element_by_id(
            "nextgen-authenticate.all.log_in_email")
        # TODO: get email from task param
        email_form.send_keys(os.environ["EMAIL"])
        sleep(5)
        password_form = browser.find_element_by_id(
            "nextgen-authenticate.all.log_in_password")
        # TODO: get password from task param
        password_form.send_keys(os.environ["PASSWORD"])
        sleep(5)
        buttons = browser.find_elements_by_css_selector("button")
        login_button = buttons[2]
        login_button.click()
        sleep(14)  # TODO: random delays
class GenerateBot(BaseGenerateTask):
    requires = Requires()
    geneeratejson = Requirement(GenerateFaqJsonFromHtml)
    downloadbottemplate = Requirement(DownloadBotTemplate)

    COVID_INTENTBASE = "covid"

    output = TargetOutput(file_pattern="data/bot/cdcfaqbot.json", ext="",
                          target_class=LocalTarget)

    def run(self):
        # Use self.output() and self.input() targets to atomically copy
        # the file locally!
        with self.geneeratejson.output().open('r') as inf:
            qas = json.load(inf)
        with self.downloadbottemplate.output().open('r') as template:
            covidbot_templates = json.load(template)
        covidbotresource = covidbot_templates
        covid_intents = covidbotresource["resource"]["intents"]
        for qa_num, qa in enumerate(qas):
            answer = qa["a"]
            q = qa["q"]
            q = q.replace(",", " ")
            if not answer.strip():
                continue
            if not q.strip():
                continue
            intent_name = f"{self.COVID_INTENTBASE}{num_to_char(qa_num)}"
            lex_template = json.loads(INTENT_JSON)
            resource = lex_template
            resource["name"] = intent_name
            resource["sampleUtterances"] = gen_sample_utterances(q, qa)
            resource["conclusionStatement"]["messages"][0]["content"] = answer
            covid_intents.append(resource)
        with self.output().open('w') as outf:
            json.dump(covidbot_templates, outf, indent=2)
class AggregateWeeklyData(Task):
    """Luigi task which aggregates covid daily stats by week."""

    requires = Requires()
    other = Requirement(DownloadCSV)

    parent_directory = Path(os.path.dirname(os.path.realpath(__file__))).parent.parent
    target_filename = os.path.join(str(parent_directory), "data/weekly_data.csv")

    def output(self):
        """Specifies the LocalTarget output for the task"""
        return LocalTarget(self.target_filename)

    def run(self):
        """Aggregates case volume, stringency index and additional stats by
        country and week. Writes the results to data/weekly_data.csv
        """
        raw_data = self.input()["other"].read_dask(filename="*.part").compute()
        raw_data["date"] = pd.to_datetime(raw_data.date)
        raw_data["week"] = raw_data["date"].apply(
            lambda x: x - pd.Timedelta(days=x.weekday()))
        raw_data["week"] = raw_data["week"].dt.date.apply(lambda x: str(x))
        weekly_data = (
            raw_data.groupby(["location", "week"])
            .agg({
                "new_cases": "sum",
                "stringency_index": "max",
                "total_deaths": "max",
                "population": "max",
                "total_tests": "max",
            })
            .reset_index()
            .dropna())
        weekly_data.to_csv(self.target_filename, index=False)
class CleanedHeadlines(Task):
    '''
    Loads the article-headline data from AWS if it exists and preprocesses it
    for analysis. The task produces a dataframe of pre-processed headlines that
    can be loaded by the Topic Modeling class for additional analysis.

    :input: S3 path to article headlines
    :output: creates a local Parquet file with the preprocessed data
    '''
    subset = BoolParameter(default=True)

    requires = Requires()
    article_headlines = Requirement(ArticleHeadlines)

    date = datetime.datetime.now()
    date_suffix = str(date.month) + '_' + str(date.day) + '_' + str(date.year)

    output = TargetOutput(
        target_class=ParquetTarget,
        ext='-' + date_suffix,
        glob="*.parquet",
    )

    def run(self):
        dsk = self.input()['article_headlines'].read_dask(
            dtype={
                "publish_date": "int32",
                "headline_text": "str",
                "headline_id": "str",
            },
            storage_options=dict(requester_pays=True),
        )
        # dsk_df = dsk.compute()
        headlines_concat = "".join(dsk["headline_id"])
        headlines_hash = hash_str(headlines_concat, get_csci_salt()).hex()[:8]
        self.output().write_dask(dsk, compression="gzip")

    def print_results(self):
        print(self.output().read_dask().compute())
class BySomething(Task):
    __version__ = "1.0.0"

    requires = Requires()
    task3 = Requirement(CleanedReviews)

    # Be sure to read from CleanedReviews locally
    output = TargetOutput(
        file_pattern="./yelpdata/",
        ext=".parquet",
        target_class=ParquetTarget,
        flag=None,
        glob="*.parquet",
        storage_options=dict(requester_pays=True),
    )

    def run(self):
        # "date" is needed below for the decade calculation.
        df = self.input()["task3"].read_dask(
            check_complete=True, columns=["date", "stars", "text"]
        )
        df["decade"] = (df["date"].dt.year // 10) % 10
        df["text_length"] = df["text"].str.len()
        by_decade = df[["decade", "text_length"]].groupby("decade").mean().round()
        by_star = df[["stars", "text_length"]].groupby("stars").mean().round()
        df.to_csv("./yelpdata/YelpReviewCleansed/*.csv",
                  index=False, sep=',', encoding='utf-8')
        self.by_decade = by_decade
        self.by_star = by_star
        print(by_decade.compute())
        print(by_star.compute())

    def print_by_decade(self):
        print(self.by_decade.compute())

    def print_by_star(self):
        print(self.by_star.compute())
class GetDeliveryTimes(Task):
    requires = Requires()
    main_page = Requirement(InfoModalDeliveryTimes)

    # get_time_window() makes sure we don't run this more than once every 5 minutes
    output = TargetOutput(
        file_pattern=os.path.join(
            "data", "delivery_times_{}".format(get_time_window()), ""),
        ext=".parquet",
        target_class=ParquetTarget,
    )

    def run(self):
        # Commented out because it significantly increases run time
        # self.detect_load_more_times_button()
        if self.detect_no_deliveries():
            self.output().write_dask(
                dd.from_pandas(pd.DataFrame([]), chunksize=1))
        else:
            self.output().write_dask(
                dd.from_pandas(self.detect_delivery_times(), chunksize=1))

    def detect_delivery_times(self):
        header = find_by_text(
            get_browser(merchant=MERCHANT_NAME),
            "Available Scheduled Times",
        )[0]
        section = get_parent(get_parent(header))
        return pd.DataFrame(self.parse_delivery_times(section.text))

    def parse_delivery_times(self, text):
        delivery_times = []
        for line in text.splitlines():
            if is_date(line):
                date = line
            if is_time(line):
                delivery_times.append({"date": date, "time": line})
            if is_money(line):
                delivery_times[-1]["price"] = line
        return delivery_times

    def detect_load_more_times_button(self):
        while True:
            try:
                button = get_browser(
                    merchant=MERCHANT_NAME).find_element_by_xpath(
                        '//button[text()="More times"]')
                button.click()
                sleep(5)
            except NoSuchElementException:
                return

    def detect_no_deliveries(self):
        try:
            get_browser(merchant=MERCHANT_NAME).find_element_by_css_selector(
                'img[alt="All delivery windows are full"]')
            return True
        except NoSuchElementException:
            return False

    def print_results(self):
        print(self.get_results())

    def get_results(self):
        return self.output().read_dask().compute()
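# The output target above is keyed on get_time_window(), which the comment says
# keeps the task from running more than once every five minutes. The helper's
# implementation is not shown in this listing; a plausible sketch (an assumption,
# not the project's actual code) simply floors the clock to a five-minute bucket,
# so every run inside the same window maps to the same target path:

import time


def get_time_window(minutes=5):
    # Hypothetical helper: floor the current Unix time to the nearest `minutes`
    # bucket so repeated runs within that window share one output target.
    now = int(time.time())
    return now - now % (minutes * 60)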
class similarity_calc(Task):
    """
    Calculate similarity based on the BM25 model, and return the top 10 most
    related pages for each slide.
    """
    LOCAL_ROOT = r'static/ranking_results'
    course_name = Parameter('CSCI-E29')

    requires = Requires()
    data = Requirement(extract_text)
    output = TargetOutput(
        file_pattern=os.path.join(LOCAL_ROOT, '{task.course_name}') + r'/',
        ext='',
        target_class=CSVTarget)

    def run(self):
        logger = logging.getLogger('luigi-interface')
        logger.info("-----------Start calculating similarity--------------")
        cfg_course_path = os.path.join(self.data.output().path, 'config.toml')
        if not os.path.exists(cfg_course_path):
            cfg_course = toml.load('config.toml')
            cfg_course['dataset'] += '_' + self.course_name
            cfg_course['index'] += '_' + self.course_name
            with atomic_write(cfg_course_path, 'w') as f:
                f.write(toml.dumps(cfg_course))
        idx = metapy.index.make_inverted_index(cfg_course_path)
        ranker = metapy.index.OkapiBM25()
        top_k = 10
        query = metapy.index.Document()
        with open(
                os.path.join(self.data.output().path,
                             '{}.dat.labels'.format(self.data.output().path)),
                'r') as fn:
            label_list = fn.read().splitlines()
        with open(
                os.path.join(self.data.output().path,
                             '{}.dat'.format(self.data.output().path)),
                'r') as fn:
            txt_list = fn.read().splitlines()
        out = pd.DataFrame(columns=range(21))
        for i in range(len(label_list)):
            if not i % 10:
                logger.info('processing---{}/{}'.format(str(i), len(label_list)))
            row = [label_list[i]]
            query.content(txt_list[i])
            result = ranker.score(idx, query, top_k + 1)
            if len(result) > 1:
                top_similarity = result[0]
                result = [res for res in result if res[0] != i][:top_k]
                result_normalize = [(label_list[res[0]], res[1] / top_similarity[1])
                                    for res in result]
                result_normalize = [
                    item for item_pair in result_normalize for item in item_pair
                ]
                row += result_normalize
            out.loc[i, range(len(row))] = row
        out_dd = dd.from_pandas(out, npartitions=1)
        self.output().write_dask(out_dd, header=False, index=False)
        logger.info("-----------Finish calculating similarity--------------")