def test_ingest_ok(self, mock_writeToElasticsearch):
    test_event = {"datasource": "ntl"}
    test_config = {
        "data-sources": {
            "ntl": {
                "type": "ntl",
                "url": self.mock_dataset_url
            }
        }
    }
    ingest.makeQueryCall = MagicMock(
        return_value=UtilsTest().get_ntl_mock_data())
    mock_formatter_factory = FormatterFactory()
    mock_formatter_factory.get_formatter = MagicMock(
        return_value=NTLDataFormatter())
    mock_formatter = NTLDataFormatter()
    mock_formatter.get_data_objects = MagicMock()
    mock_slack_notifier = SlackNotifier(None, None)
    mock_slack_notifier.sendSlackNotification = MagicMock()

    ingest.ingest(test_event, test_config)

    mock_writeToElasticsearch.assert_called_once()
def daily(self):
    print("Running Daily Job")
    # the ingest function sorts and moves files by date into the working/media directory
    ingest.ingest(ingestdir, workingdir)
    # the crawl function performs a hash index of all files in the target directories
    workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
    archivedirsum = crawl.crawl(False, archivedir, jsondatadir)
    # the dedupe function combines all hash indexes and analyzes the dataset for duplicates
    data_files = glob.glob(jsondatadir + '/*.json')
    # run the dedupe function
    dedupe.dedupe(data_files, duplicatedir)
def test_ingest_error_on_es(self, mock_sendSlackNotification,
                            mock_writeToElasticsearch):
    test_event = {"datasource": "ntl"}
    test_config = {
        "data-sources": {
            "ntl": {
                "type": "ntl",
                "url": self.mock_dataset_url
            }
        }
    }
    ingest.makeQueryCall = MagicMock(
        return_value=UtilsTest().get_ntl_mock_data())
    mock_writeToElasticsearch.side_effect = Exception("Test Exception")

    ingest.ingest(test_event, test_config)

    mock_writeToElasticsearch.assert_called_once()
    mock_sendSlackNotification.assert_called_once()
def test_ingest_invalid_formatter(self, mock_sendSlackNotification,
                                  mock_get_formatter):
    test_event = {"datasource": "ntl"}
    test_config = {
        "data-sources": {
            "ntl": {
                "type": "ntl",
                "url": self.mock_dataset_url
            }
        }
    }
    ingest.makeQueryCall = MagicMock(
        return_value=UtilsTest().get_ntl_mock_data())
    mock_get_formatter.return_value = None

    ingest.ingest(test_event, test_config)

    mock_get_formatter.assert_called_once()
    mock_sendSlackNotification.assert_called_once()
def run(data_dir):
    """
    Run the pipeline. Intermediate files go into data/extracted, data/parsed,
    and data/standardized; the standardized output is ingested into ./expenses.db.
    """
    cores = mp.cpu_count()
    pool = mp.Pool(cores)
    jobs = []

    raw_dir = os.path.join(data_dir, "raw")
    extracted_dir = os.path.join(data_dir, "extracted")
    parsed_dir = os.path.join(data_dir, "parsed")
    standardized_dir = os.path.join(data_dir, "standardized")

    if len(os.listdir(raw_dir)) == 0:
        return False

    make_dirs([extracted_dir, parsed_dir, standardized_dir])

    with tempfile.TemporaryDirectory() as tmp_standardized_dir:
        for raw, extracted, parsed, standardized in get_pipeline_files(
                raw_dir, extracted_dir, parsed_dir, tmp_standardized_dir):
            jobs.append(
                pool.apply_async(_etl, (raw, extracted, parsed, standardized)))
        # wait for all workers to finish before ingesting
        [job.get() for job in jobs]
        pool.close()
        pool.join()

        # TODO: hardcoded expenses tablename and expenses.db
        ingest(
            get_files(tmp_standardized_dir),
            "expenses",
            os.path.join(data_dir, "expenses.db"),
        )

        for file_ in os.listdir(tmp_standardized_dir):
            os.replace(
                os.path.join(tmp_standardized_dir, file_),
                os.path.join(standardized_dir, file_),
            )

    return True
def trigger_ingest():
    url = request.args.get("url")
    table_name = ingest(url)
    with psycopg2.connect("host=localhost") as conn:
        with conn.cursor() as cur:
            cur.execute(
                "select column_name from information_schema.columns where table_name = %s and column_name != 'geom'",
                [table_name])
            columns = [row[0] for row in cur.fetchall()]
    for column_name in columns:
        annotate.suggest_concept(table_name, column_name)
    return jsonify({"status": "ok"})
def test(self):
    print("Running Full Test Sequence")
    # the ingest function sorts and moves files by date into the working/media directory
    ingest.ingest(ingestdir, workingdir)
    # the crawl function performs a hash index of all files in the target directories
    workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
    archivedirsum = crawl.crawl(False, archivedir, jsondatadir)
    # the dedupe function combines all hash indexes and analyzes the dataset for duplicates
    data_files = glob.glob(jsondatadir + '/*.json')
    # run the dedupe function
    dedupe.dedupe(data_files, duplicatedir)
    # after the dedupe function has moved duplicates out, reindex
    workingdirsum = crawl.crawl(True, workingdir, jsondatadir)
    # the archive function pulls from the working/media directory and pools into sized volumes
    archive.archive(archivedir, jsondatadir, workingdir, mediasize)
    # validate that all files in duplicates exist elsewhere before moving to validated
    validate.validate(duplicatedir, workingdir, archivedir, validateddir)
    print("Daily Job Completed Successfully")
def process(self, filepath):
    # ignore hidden files (e.g., .gitignore)
    if not os.path.basename(filepath)[0] == '.':
        try:
            processed = self.files[filepath]
        except KeyError:
            log.info(
                'new file in dropbox %s created %s', filepath,
                datetime.datetime.fromtimestamp(os.path.getctime(filepath)))
            processed = False
        if not processed and not os.path.basename(filepath) == '.gitignore':
            try:
                _id = ingest.ingest(filepath)
                # move file to ingested directory
                dest = add_id(
                    _id,
                    os.path.join(app.ingested_path, os.path.basename(filepath)))
                log.info('moving ingested file from %s to %s', filepath, dest)
                shutil.move(filepath, dest)
            except Exception as e:
                log.warning('failed to ingest %s', filepath)
                log.warning(e)
                _id = save_file_metadata(filepath, status='error')
                # move file to failed directory
                dest = add_id(
                    _id,
                    os.path.join(app.failed_path, os.path.basename(filepath)))
                log.info('moving failed file from %s to %s', filepath, dest)
                shutil.move(filepath, dest)
            processed = True
            self.files[filepath] = processed
def testNoTitle():
    import ingest
    if os.path.exists(TEST_OUTPUT):
        shutil.rmtree(TEST_OUTPUT)
    if not os.path.exists(TEST_INPUT):
        os.makedirs(TEST_INPUT)
    filename = os.path.join(TEST_INPUT, 'no-title.md')
    fh = open(filename, 'w')
    fh.write(noTitle)
    fh.close()
    loc = ingest.ingest(filename, TEST_OUTPUT)
    notesFolders = [os.path.join(loc, f) for f in os.listdir(loc)]
    assert len(notesFolders) == 1
    with open(os.path.join(notesFolders[0], 'note.md')) as fh:
        content = fh.read()
    assert 'jd9d09j1290js1902js129nvsvns' in content
def testPreSection():
    import ingest
    if os.path.exists(TEST_OUTPUT):
        shutil.rmtree(TEST_OUTPUT)
    if not os.path.exists(TEST_INPUT):
        os.makedirs(TEST_INPUT)
    filename = os.path.join(TEST_INPUT, 'pre-section.md')
    fh = open(filename, 'w')
    fh.write(preSection)
    fh.close()
    loc = ingest.ingest(filename, TEST_OUTPUT)
    notesFolders = [os.path.join(loc, f) for f in os.listdir(loc)]
    assert len(notesFolders) == 1
    with open(os.path.join(notesFolders[0], 'note.md')) as fh:
        content = fh.read()
    assert 'f48fh309dj0913dj9j190dj029' in content
def testIngest():
    import ingest
    if os.path.exists(TEST_OUTPUT):
        shutil.rmtree(TEST_OUTPUT)
    if not os.path.exists(TEST_INPUT):
        os.makedirs(TEST_INPUT)
    filename = os.path.join(TEST_INPUT, 'ingest.md')
    fh = open(filename, 'w')
    fh.write(simulatedIngestDotMd)
    fh.close()
    loc = ingest.ingest(filename, TEST_OUTPUT)
    notesFolders = [f.split('_')[0] for f in os.listdir(loc)]
    print(notesFolders)
    assert time1.split(' ')[0] in notesFolders
    assert time2.split(' ')[0] in notesFolders
    # When no time is set in note, use current time.
    nowTimeAsFolderStr = ingest.unixTimeAsSafeStr(time.time()).split(' ')[0]
    assert nowTimeAsFolderStr in notesFolders
    # Make sure that note that had no time now has time in it.
    notesFolders = [os.path.join(loc, f) for f in os.listdir(loc)]
    filename = None
    for nf in notesFolders:
        if nowTimeAsFolderStr in nf:
            assert filename is None
            filename = os.path.join(nf, 'note.md')
    with open(filename, 'r') as fh:
        content = fh.read()
    assert 'time::' in content
def testPoundIsFirst():
    import ingest
    if os.path.exists(TEST_OUTPUT):
        shutil.rmtree(TEST_OUTPUT)
    if not os.path.exists(TEST_INPUT):
        os.makedirs(TEST_INPUT)
    poundFirst = """# The title
the content
"""
    filename = os.path.join(TEST_INPUT, 'ingest.md')
    fh = open(filename, 'w')
    fh.write(poundFirst)
    fh.close()
    loc = ingest.ingest(filename, TEST_OUTPUT, poundFirst.splitlines())
    notesFolders = [f.split('_')[0] for f in os.listdir(loc)]
    print(notesFolders)
#!/usr/bin/env python3
from ingest import ingest

try:
    import cPickle as pickle
except ImportError:
    import pickle
import sys

if __name__ == '__main__':
    # when run as a script, ingest the source and pickle it to the given path
    source = ingest(sys.argv[1])
    with open(sys.argv[2], 'wb') as f:
        pickle.dump(source, f)
else:
    # when imported, load a previously pickled source
    with open(sys.argv[1], 'rb') as f:
        source = pickle.load(f)
from ingest import ingest
from itertools import chain
from pymongo import MongoClient

# functions for handling init
init_client = lambda uri='': MongoClient(uri)
init_db = lambda client, database_name: client[database_name]

# insertion functions
insert_restaurants_by_borough = lambda db, f: list(
    map(lambda r: db[r['borough']].insert_one(r), ingest(f)))

# query functions
get_all_by_borough = lambda db, borough: list(db[borough].find())

get_all_by_zipcode = lambda db, zipcode: list(
    chain.from_iterable(
        map(lambda c: list(db[c].find({"address.zipcode": zipcode})),
            db.collection_names())))

get_all_by_zipcode_and_grade = lambda db, zipcode, grade: list(
    chain.from_iterable(
        map(
            lambda c: list(db[c].find({
                "address.zipcode": zipcode,
                "grades.0.grade": grade
            })), db.collection_names())))

get_all_by_zipcode_and_score = lambda db, zipcode, score: list(
    chain.from_iterable(
for _ in range(num_args - 1):
    cur_delim_end = recv_msg.rfind(':*:', 0, last_delim_start) + 3  # marks the index after the current delimiter
    args.insert(0, recv_msg[cur_delim_end:last_delim_start])
    last_delim_start = cur_delim_end - 3
    # raise Exception('First Delim: ', first_delim_end, 'Last delim: ', cur_delim_end - 3)
# raise Exception(args)
args.insert(0, recv_msg[first_delim_end:last_delim_start])
# command = cmd_list[0]

# This dictionary uses the command passed from the frontend to run the
# relevant workspace-/index-related functions
# print(args)
cmd_dict = {
    'create-workspace': lambda: create_workspace(name=args[0]),
    'delete-workspace': lambda: delete_workspace(guid=args[0]),
    'import-folder': lambda: ingest(path=args[0], import_type='kive',
                                    workspace_guid=args[1]),
    'import-file': lambda: ingest(path=args[0], import_type='kive',
                                  workspace_guid=args[1]),
    'import-wsb': lambda: ingest(path=args[0], import_type='wsb',
                                 workspace_guid=args[1]),
    'import-sb': lambda: ingest(path=args[0], import_type='sb',
                                workspace_guid=args[1]),
    'delete-files': lambda: delete(json_lst=args[0], workspace_guid=args[1]),
    'update-files': lambda: update(json_lst=args[0], workspace_guid=args[1]),
    'search': lambda: search_from_strs(search_text=args[0],
                                       leg_datetime_range=args[1],
                                       kive_datetime_range=args[2],
                                       la_datetime_range=args[3],
                                       media_text_lst=args[4],
                                       fields_lst=args[5],
                                       workspace_guid=args[6])
}
cmd_dict[command]()
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import joblib

from ingest import ingest
from labelizeTweets import labelizeTweets
from tweet_tokenize import tokenize

# Get the file into a DF for training
file = r"C:\Users\mayank.nagar\Desktop\ML\twitter_analysis\train_data\Sentiment-Analysis-Dataset\SentimentAnalysisDataset.csv"
df = ingest(file)
print("File received and processed into dataframe")

df['tokens'] = df['SentimentText'].map(tokenize)
print("Dataframe tokenization completed")

# Split the DF into training and testing
x_train, x_test, y_train, y_test = train_test_split(np.array(df.tokens),
                                                     np.array(df.Sentiment),
                                                     test_size=0.2)
x_train = labelizeTweets(x_train, 'TRAIN')
x_test = labelizeTweets(x_test, 'TEST')
print("Dataframe split into training and test completed")

corpus = [x.words for x in x_train]
print("Training TF-IDF vector")
vectorizer = TfidfVectorizer(analyzer=lambda x: x, min_df=100)
matrix = vectorizer.fit_transform(corpus)
tfidf = dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))
def ingest(self):
    print("Ingesting Files")
    # the ingest function sorts and moves files by date into the working/media directory
    a = ingest.ingest(ingestdir, workingdir)