def run(self):
    """Store keyword-filter labels in MongoDB, then delete the scratch files.

    Each line of the task input is parsed as a JSON object mapping document
    ids to labels. The mappings are merged into one dict (a later line wins
    when a document id repeats) and written with a single
    ``MongoRangeTarget.write`` call against ``self.db`` / ``self.collection``
    / ``self.field``. A short summary is printed, and the intermediate text
    files are then removed from the current working directory.

    Cleanup tolerates files that are already absent: by that point the labels
    have been written, so a missing scratch file should not fail the task and
    leave the remaining files behind.

    Raises:
        ValueError: if an input line is not valid JSON
            (``json.JSONDecodeError`` is a ``ValueError`` subclass).
        OSError: if a scratch file exists but cannot be removed.
    """
    print("Beginning StoreLabel() task ...")
    print("Storing keyword filter labels in MongoDB ...")
    client = initialize_mongo()

    # Merge every {doc_id: label} line into a single mapping for one bulk write.
    dump = {}
    with self.input().open("r") as in_file:
        for line in in_file:
            dump.update(json.loads(line).items())

    doc_ids = list(dump)
    target = MongoRangeTarget(client, self.db, self.collection, doc_ids,
                              self.field)
    target.write(dump)
    # NOTE(review): the result of exists() is discarded; the call is kept so
    # the task issues the same target calls as before. Confirm whether it can
    # be dropped or should be asserted on.
    target.exists()

    print("StoreLabel() task complete")
    print("{} new labels stored in MongoDB".format(len(doc_ids)))
    print(
        "{} posts contain keywords".format(
            sum(value == 1 for value in dump.values())
        )
    )

    print("Clearing all out_files ...")
    scratch_files = (
        "urls.txt",
        "extracted_text.txt",
        "translated_text.txt",
        "filtered_text.txt",
        "dummy_extraction.txt",
        "dummy_translation.txt",
    )
    for path in scratch_files:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone (e.g. removed by an earlier, partially completed
            # cleanup); keep going so the other files are still cleared.
            continue
    print("Out_files cleared")
def run(self):
    """Write a JSON map of document id -> S3 url for recently scraped images.

    Queries ``self.collection`` in ``self.db`` for documents whose
    ``scraped_date`` falls in the one-day window ending 30 days before the
    current UTC time, capped at 100 documents. Documents whose ``media_type``
    is ``"image"`` contribute ``str(_id) -> s3_url`` to a dict, which is
    written to the task output as a single JSON object.
    """
    print("Beginning SourceData() task ...")
    print("Getting image urls from MongoDB ...")
    mongo_client = initialize_mongo()
    collection_target = MongoCollectionTarget(mongo_client, self.db,
                                              self.collection)
    collection = collection_target.get_collection()
    print(collection)

    # Testing filter: a one-day window of data that is one month old.
    window_end = datetime.utcnow() - timedelta(days=30)
    window_start = window_end - timedelta(days=1)
    date_query = {"scraped_date": {"$gte": window_start, "$lt": window_end}}

    with self.output().open("w") as out_file:
        image_urls = {}
        # The limit is applied to the query itself (before the media_type
        # check below) and is only there for testing.
        for document in collection.find(date_query).limit(100):
            if document["media_type"] != "image":
                continue
            s3_url = document["s3_url"]
            image_urls[str(document["_id"])] = s3_url
        out_file.write(json.dumps(image_urls))

    print("SourceData() task complete")
    print("Image urls written to out_file")
def run(self):
    """Store translated image text in MongoDB and write a marker output file.

    Every line of the task input is parsed as a JSON object mapping document
    ids to translated text. The mappings are merged into one dict and written
    with a single ``MongoRangeTarget.write`` call against ``self.db`` /
    ``self.collection`` / ``self.field``. Finally the literal text ``done`` is
    written to the task output.
    """
    print("Beginning StoreTranslation() task ...")
    print("Storing translated text in MongoDB ...")
    mongo_client = initialize_mongo()

    # Collect all {doc_id: text} pairs first so they go out in one write.
    translations = {}
    with self.input().open("r") as in_file:
        for line in in_file:
            translations.update(json.loads(line).items())

    doc_ids = list(translations)
    range_target = MongoRangeTarget(mongo_client, self.db, self.collection,
                                    doc_ids, self.field)
    range_target.write(translations)

    print("StoreTranslation() task complete")
    print("{} image text translations stored in MongoDB".format(len(doc_ids)))

    # Write a dummy output file so that StoreLabel's dependency is fulfilled.
    with self.output().open("w") as marker_file:
        marker_file.write("done")