def run(self):
    """Store keyword-filter labels in MongoDB, then delete the intermediate files.

    Every line of the task input is parsed as a JSON object mapping a
    document id to its label. All pairs are merged into one dict (a later
    line overrides an earlier one for the same id) and handed to a single
    ``MongoRangeTarget.write`` call for ``self.field`` in
    ``self.db`` / ``self.collection``.

    Afterwards the pipeline's intermediate text files are removed from the
    current working directory. A file that is already missing is reported
    and skipped, so the remaining files are still removed instead of being
    left behind by an early ``FileNotFoundError``.

    Raises:
        OSError: if a file exists but cannot be removed (any failure other
            than the file being absent).
    """
    print("Beginning StoreLabel() task ...")
    print("Storing keyword filter labels in MongoDB ...")
    client = initialize_mongo()
    with self.input().open("r") as in_file:
        labels = {
            doc_id: label
            for line in in_file
            for doc_id, label in json.loads(line).items()
        }
    doc_ids = list(labels)
    target = MongoRangeTarget(client, self.db, self.collection, doc_ids, self.field)
    target.write(labels)
    # NOTE(review): the result of exists() is discarded; the call is kept
    # only to preserve the original behaviour -- confirm whether it is needed.
    target.exists()
    print("StoreLabel() task complete")
    print("{} new labels stored in MongoDB".format(len(doc_ids)))
    print(
        "{} posts contain keywords".format(
            sum(value == 1 for value in labels.values())
        )
    )
    print("Clearing all out_files ...")
    out_files = (
        "urls.txt",
        "extracted_text.txt",
        "translated_text.txt",
        "filtered_text.txt",
        "dummy_extraction.txt",
        "dummy_translation.txt",
    )
    for path in out_files:
        try:
            os.remove(path)
        except FileNotFoundError:
            # Already gone: keep going so the files after it are still cleared.
            print("{} not found, skipping".format(path))
    print("Out_files cleared")
 def run(self):
     """Write the id -> S3 URL mapping of recently scraped image posts as JSON.

     Queries the collection returned by ``MongoCollectionTarget`` for
     documents whose ``scraped_date`` falls in the one-day window ending
     30 days before the current UTC time. At most 100 matching documents
     are fetched; of those, only the ones with ``media_type == "image"``
     are kept. The resulting ``{str(_id): s3_url}`` dict is written to the
     task output as a single JSON document.
     """
     print("Beginning SourceData() task ...")
     print("Getting image urls from MongoDB ...")
     mongo_client = initialize_mongo()
     collection = MongoCollectionTarget(
         mongo_client, self.db, self.collection
     ).get_collection()
     print(collection)
     # Testing window: a single day of data that is one month old.
     window_end = datetime.utcnow() - timedelta(days=30)
     window_start = window_end - timedelta(days=1)
     with self.output().open("w") as out_file:
         date_filter = {
             "scraped_date": {"$gte": window_start, "$lt": window_end},
         }
         image_urls = {}
         # The cap of 100 is a testing limit; it applies to the query
         # result, i.e. before the media_type check below.
         for record in collection.find(date_filter).limit(100):
             if record["media_type"] != "image":
                 continue
             image_urls[str(record["_id"])] = record["s3_url"]
         out_file.write(json.dumps(image_urls))
         print("SourceData() task complete")
         print("Image urls written to out_file")
    def run(self):
        """Store translated text in MongoDB and emit a marker output file.

        Each line of the task input is parsed as a JSON object mapping a
        document id to its translated text. The pairs from all lines are
        merged into one dict (a later line wins for a repeated id) and
        passed to one ``MongoRangeTarget.write`` call for ``self.field`` in
        ``self.db`` / ``self.collection``. Finally the literal text
        ``done`` is written to the task output.
        """
        print("Beginning StoreTranslation() task ...")
        print("Storing translated text in MongoDB ...")
        mongo_client = initialize_mongo()
        with self.input().open("r") as in_file:
            translations = {
                doc_id: text
                for line in in_file
                for doc_id, text in json.loads(line).items()
            }
        ids = list(translations)
        range_target = MongoRangeTarget(
            mongo_client, self.db, self.collection, ids, self.field
        )
        range_target.write(translations)
        print("StoreTranslation() task complete")
        print("{} image text translations stored in MongoDB".format(len(ids)))

        # The output is only a marker file: its presence is what fulfils
        # StoreLabel's dependency on this task.
        with self.output().open("w") as marker_file:
            marker_file.write("done")