def __init__(self, test=False):
    "Inits the script"
    Trace.info("Starting" + (" ", " test")[test] + " script...")
    # change paths and indexes in case of test
    if test:
        # path of the files
        self.hotels_file = os.path.join(self.filedir, "./data/hotels_test.csv")
        self.comments_file = os.path.join(self.filedir, "./data/comments_test.csv")
        self.bitext_file = os.path.join(self.filedir, "./data/bitext_tuipilot_test.csv")
        # indexes
        self.hotels_index = "test_hotels"
        self.comments_index = "test_comments"
        self.bitext_index = "test_bitext"
        self.bitext_unique_index = "test_bitext_unique"
        self.bitext_unique_posneg_index = "test_bitext_unique_posneg"
    # hotels first
    self.build_hotels_index()
    # then comments
    self.build_comments_index()
    # then the rest
    self.build_bitext_indexes()
    Trace.info(("S", "Test s")[test] + "cript finished.")
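# Side note on the (" ", " test")[test] idiom above: it indexes a two-element
# tuple with a bool, which works because bool is a subclass of int
# (False == 0, True == 1). A minimal illustration:
print("Starting" + (" ", " test")[True] + " script...")   # Starting test script...
print("Starting" + (" ", " test")[False] + " script...")  # Starting script...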
def process_package(self, package_name):
    _id = package_name.replace("/", "_")
    # grab npmjs registry information
    npm_registry_info = json.loads(requests.get("http://registry.npmjs.org/" + package_name).text)
    Trace.info("npm_registry_info processed ok")
    # grab npm-stat info (downloads over the last 30 days)
    today = date.today()
    month_ago = today - timedelta(30)
    npm_stat_info = json.loads(requests.get(
        "http://npm-stat.com/downloads/range/" +
        month_ago.strftime("%Y-%m-%d") + ":" + today.strftime("%Y-%m-%d") +
        "/" + package_name).text)
    Trace.info("npm_stat_info processed ok")
    # build the doc and feed elasticsearch
    # _type first: the repo type of the package, or "no_repo" if there is none
    _type = "no_repo"
    if "repository" in npm_registry_info and "type" in npm_registry_info["repository"]:
        _type = npm_registry_info["repository"]["type"].replace("/", "_")
    # init document with the number of published versions
    document = {"versions": 0}
    if "versions" in npm_registry_info:
        document["versions"] = len(npm_registry_info["versions"].keys())
    # calculate the average daily downloads
    downloads = [0]
    if "downloads" in npm_stat_info and len(npm_stat_info["downloads"]) > 0:
        downloads = [item["downloads"] for item in npm_stat_info["downloads"]]
    document["average_downloads"] = reduce(lambda x, y: x + y, downloads) / len(downloads)
    # insert document
    Trace.info("about to upsert")
    Trace.info(json.dumps(self.elasticsearch.upsert_document(self._index, _type, _id, document)))
    Trace.info("upserted")
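# A minimal sketch of the average-downloads step above, run against a
# hand-made payload (the shape mirrors what the code expects from npm-stat;
# the sample values are invented):
from functools import reduce  # a builtin on Python 2, importable on both

npm_stat_info = {"downloads": [{"day": "2015-01-01", "downloads": 10},
                               {"day": "2015-01-02", "downloads": 20},
                               {"day": "2015-01-03", "downloads": 33}]}
downloads = [item["downloads"] for item in npm_stat_info["downloads"]]
print(reduce(lambda x, y: x + y, downloads) / len(downloads))  # 21 (integer division on Python 2)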
def build_npm_packages_index(self):
    global test_packages
    package_names = []
    # check if testing
    if test_packages is not None and len(test_packages) > 0:
        package_names = test_packages
        Trace.info("Testing. Packages reduced to: " + str(len(package_names)))
    else:  # not testing
        # get all the docs
        Trace.info("grabbing all packages from npm registry...")
        packages = json.loads(requests.get("https://skimdb.npmjs.com/registry/_all_docs").text)["rows"]
        package_names = [item["id"] for item in packages]
        Trace.info(str(len(package_names)) + " total packages grabbed")
        # apply offset
        package_names = package_names[self._offset:]
        Trace.info("Offset. Packages reduced to: " + str(len(package_names)))
    # go through them and feed elasticsearch
    for package_name in package_names:
        Trace.info("processing package: " + package_name)
        try:
            self.process_package(package_name)
        except Exception:  # keep crawling past any single broken package
            print("Error processing package: " + package_name + ": " + str(sys.exc_info()[0]))
            continue
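# Usage sketch: the offset exists so a long crawl over the full registry can
# resume after a crash. With the argv parsing in this script's __main__ block,
# a run that died around package 125000 could be resumed with (hypothetical
# invocation; the script filename is mine):
#
#   python build_npm_packages.py 125000
#
# which calls _Main(offset=125000) and slices package_names[125000:].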
def build_comments_index(self):
    Trace.info("Building comments index...")
    # build the typemap
    comments_typemap = {"averageWebScore": int}
    comments_replace = [{"key": "commentId", "find": ".", "replace": ""},
                        {"key": "hotelSequence", "find": ".", "replace": ""}]
    # get the bulk of documents
    comments = CsvManager.read(self.comments_file, typemap=comments_typemap, replace=comments_replace)
    Trace.info(str(len(comments)) + " comments read")
    # bulk_upsert
    comments_upserted = self.elasticsearch.upsert_bulk(self.comments_index, "hotelSequence", "commentId", comments)
    Trace.info(str(comments_upserted) + " comments upserted in " + self.comments_index)
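# A sketch of what the replace rules above are assumed to do (CsvManager is
# this repo's own helper; this is the equivalent pure-Python transformation,
# stripping "." thousands separators so the IDs parse cleanly):
row = {"commentId": "330.956", "hotelSequence": "69.559"}
for rule in [{"key": "commentId", "find": ".", "replace": ""},
             {"key": "hotelSequence", "find": ".", "replace": ""}]:
    row[rule["key"]] = row[rule["key"]].replace(rule["find"], rule["replace"])
print(row)  # {'commentId': '330956', 'hotelSequence': '69559'}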
def build_hotels_index(self):
    Trace.info("Building hotels index...")
    # build the typemap: every column after the first three is an int
    hotels_keys = CsvManager.read_keys(self.hotels_file)
    hotels_typemap = dict(zip(hotels_keys[3:], [int] * len(hotels_keys[3:])))
    hotels_replace = [{"key": "hotelSequence", "find": ".", "replace": ""},
                      {"key": "mailsEnviados", "find": ".", "replace": ""}]
    # get the bulk of documents
    hotels = CsvManager.read(self.hotels_file, typemap=hotels_typemap, replace=hotels_replace)
    Trace.info(str(len(hotels)) + " hotels read")
    # bulk_upsert
    hotels_upserted = self.elasticsearch.upsert_bulk(self.hotels_index, "destinationCode", "hotelSequence", hotels)
    Trace.info(str(hotels_upserted) + " hotels upserted in " + self.hotels_index)
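# A small illustration of the dict(zip(...)) typemap trick above (the column
# names past the known ones are hypothetical):
hotels_keys = ["destinationCode", "hotelSequence", "hotelName", "mailsEnviados", "stars"]
hotels_typemap = dict(zip(hotels_keys[3:], [int] * len(hotels_keys[3:])))
print(hotels_typemap)  # {'mailsEnviados': int, 'stars': int} (repr varies by Python version)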
def __init__(self, test=False, offset=0):
    "Inits the script"
    global test_packages
    Trace.info("Starting" + (" ", " test")[test] + " script...")
    # change paths and indexes in case of test
    if test:
        test_packages_file = os.path.join(self.filedir, "./data/test_npm_package_names")
        test_packages = [item["test_package_name"] for item in CsvManager.read(test_packages_file)]
        self._index = "test_npm_packages"
        Trace.info("test_packages: " + json.dumps(test_packages))
    # set offset
    self._offset = offset
    # build npm_packages_index
    self.build_npm_packages_index()
    Trace.info(("S", "Test s")[test] + "cript finished.")
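# The fixture above is assumed to be a one-column CSV whose header matches the
# key being read; hypothetical contents of data/test_npm_package_names:
#
#   test_package_name
#   express
#
# CsvManager.read() (this repo's helper) would then yield row dicts, and the
# list comprehension reduces them to plain names:
rows = [{"test_package_name": "express"}]
print([item["test_package_name"] for item in rows])  # ['express']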
def build_bitext_indexes(self):
    "Builds bitext, bitext_unique and bitext_unique_posneg indexes"
    Trace.info("Building bitext, bitext_unique and bitext_unique_posneg indexes...")
    # typemap and replace (scores come with "," as the decimal separator)
    bitext_replace = [{"key": "score", "find": ",", "replace": "."}]
    bitext_typemap = {"score": float}
    # get the bulk of bitexts
    bitexts = CsvManager.read(self.bitext_file, typemap=bitext_typemap, replace=bitext_replace)
    # iterate the bulk of bitexts and insert the element in each of the indexes
    for _id, bitext_item in enumerate(bitexts):
        # add info from hotels
        hotel = self.elasticsearch.read_document(self.hotels_index, "_all", bitext_item["hotelSequence"])
        if "found" in hotel and hotel["found"]:
            # merge the found hotel fields into the bitext item
            # (copy + update works on both Python 2 and 3, unlike adding .items())
            bitext_item = dict(bitext_item)
            bitext_item.update(hotel["_source"])
        # upsert element
        bitext_type = bitext_item["section"]
        del bitext_item["section"]
        Trace.info("upserting bitext " + str(_id))
        self.elasticsearch.upsert_document(self.bitext_index, bitext_type, str(_id), bitext_item)

        # update bitext_unique_posneg index (one doc per comment + POS/NEG section)
        previous_average_score = 0
        previous_count = 0
        previous_categories = ""
        separator = ""
        bitext_unique_posneg_id = bitext_item["commentId"] + bitext_type
        bitext_unique_posneg_item = self.elasticsearch.read_document(
            self.bitext_unique_posneg_index, "_all", bitext_unique_posneg_id)
        if "found" in bitext_unique_posneg_item and bitext_unique_posneg_item["found"]:
            previous_count = bitext_unique_posneg_item["_source"]["count"]
            previous_average_score = bitext_unique_posneg_item["_source"]["averageScore"]
            previous_categories = bitext_unique_posneg_item["_source"]["category"]
            separator = ", "
        bitext_unique_posneg_upsert_doc = {
            "section": bitext_type,
            "averageScore": 1.0 * (previous_average_score * previous_count + bitext_item["score"]) / (previous_count + 1),
            "count": previous_count + 1,
            "category": previous_categories + separator + bitext_item["category"]
        }
        # upsert
        self.elasticsearch.upsert_document(self.bitext_unique_posneg_index, bitext_item["hotelSequence"],
                                           bitext_unique_posneg_id, bitext_unique_posneg_upsert_doc)

        # update bitext_unique index (one doc per comment)
        previous_average_score = 0
        previous_count = 0
        previous_categories = ""
        separator = ""
        bitext_unique_id = bitext_item["commentId"]
        bitext_unique_item = self.elasticsearch.read_document(self.bitext_unique_index, "_all", bitext_unique_id)
        if "found" in bitext_unique_item and bitext_unique_item["found"]:
            previous_count = bitext_unique_item["_source"]["count"]
            previous_average_score = bitext_unique_item["_source"]["averageScore"]
            previous_categories = bitext_unique_item["_source"]["category"]
            separator = ", "
        bitext_unique_upsert_doc = {
            "averageScore": 1.0 * (previous_average_score * previous_count + bitext_item["score"]) / (previous_count + 1),
            "count": previous_count + 1,
            "category": previous_categories + separator + bitext_item["category"]
        }
        # look for the comment in the comments index
        comment = self.elasticsearch.read_document(self.comments_index, "_all", bitext_unique_id)
        if "found" in comment and comment["found"]:
            # add the comment's averageWebScore and derive the score diffs
            bitext_unique_upsert_doc["averageWebScore"] = comment["_source"]["averageWebScore"]
            bitext_unique_upsert_doc["scoresDiff"] = bitext_unique_upsert_doc["averageScore"] - bitext_unique_upsert_doc["averageWebScore"]
            bitext_unique_upsert_doc["scoresAbsDiff"] = math.fabs(bitext_unique_upsert_doc["scoresDiff"])
        # upsert
        self.elasticsearch.upsert_document(self.bitext_unique_index, bitext_item["hotelSequence"],
                                           bitext_unique_id, bitext_unique_upsert_doc)
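# The two "unique" indexes above maintain a running mean without re-reading
# old scores: new_avg = (prev_avg * prev_count + score) / (prev_count + 1).
# A minimal sketch (the helper name is mine) checking that the incremental
# update matches a recomputed mean:
def update_running_average(prev_avg, prev_count, value):
    return 1.0 * (prev_avg * prev_count + value) / (prev_count + 1)

scores = [2.0, 4.0, 3.0]
avg, count = 0, 0
for s in scores:
    avg = update_running_average(avg, count, s)
    count += 1
assert avg == sum(scores) / len(scores)  # both are 3.0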
    self.assertEquals(last_bitext["_source"]["score"], 2.0)
    self.assertEquals(last_bitext["_source"]["mailsEnviados"], 37)
    # test bitext_unique_posneg index
    bitext330956POS = self.elasticsearch.read_document("test_bitext_unique_posneg", "69559", "330956POS")
    self.assertTrue(bitext330956POS["found"])
    self.assertEquals(bitext330956POS["_source"]["averageScore"], 2.0)
    # test bitext_unique index
    bitext330956 = self.elasticsearch.read_document("test_bitext_unique", "69559", "330956")
    self.assertTrue(bitext330956["found"])
    self.assertEquals(bitext330956["_source"]["averageScore"], 2.0)
    self.assertEquals(bitext330956["_source"]["averageWebScore"], 5)
    self.assertEquals(bitext330956["_source"]["scoresDiff"], -3.0)
    self.assertEquals(bitext330956["_source"]["scoresAbsDiff"], 3.0)

def tearDown(self):
    # delete indexes
    self.elasticsearch.remove_index("test_hotels")
    self.elasticsearch.remove_index("test_comments")
    self.elasticsearch.remove_index("test_bitext")
    self.elasticsearch.remove_index("test_bitext_unique_posneg")
    self.elasticsearch.remove_index("test_bitext_unique")

if __name__ == '__main__':
    # unittest.main()
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        Trace.info("test")
        unittest.main(argv=sys.argv[:1], exit=True)
    else:
        Trace.info("main")
        _Main()
@unittest.skipIf(not(elasticsearch.is_up()), "irrelevant test if there is no elasticsearch instance")
def test_script(self):
    global test_packages
    _Main(test=True)
    # count documents
    self.assertTrue(self.elasticsearch.count_documents("test_npm_packages") > 0)
    # assert express
    express_package = self.elasticsearch.read_document("test_npm_packages", "_all", "express")
    self.assertTrue(express_package["found"])

def tearDown(self):
    # delete indexes
    self.elasticsearch.remove_index("test_npm_packages")

if __name__ == '__main__':
    # unittest.main()
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        Trace.info("test")
        unittest.main(argv=sys.argv[:1], exit=True)
    else:
        if len(sys.argv) > 1:
            try:
                offset = int(sys.argv[1])
            except ValueError:
                offset = None  # non-numeric argument: silently skip, as before
            if offset is not None:
                Trace.info("main with offset: " + str(offset))
                _Main(offset=offset)
        else:
            Trace.info("main")
            _Main()