def build_npm_packages_index(self):
    """Feed elasticsearch with one document per npm package.

    Takes the package list either from the global test_packages (when
    testing) or from the full npm registry, applies self._offset, then
    processes each package, logging and skipping any that fail.
    """
    global test_packages
    package_names = []
    # check if testing: a non-empty test_packages list overrides the registry
    if test_packages:
        package_names = test_packages
        Trace.info("Testing. Packages reduced to: " + str(len(package_names)))
    else:
        # not testing: get all the docs from the npm registry
        Trace.info("grabbing all packages from npm registry...")
        packages = json.loads(requests.get("https://skimdb.npmjs.com/registry/_all_docs").text)["rows"]
        package_names = [item["id"] for item in packages]
        Trace.info(str(len(package_names)) + " total packages grabbed")
    # apply offset (lets a previous partial run be resumed)
    package_names = package_names[self._offset:]
    Trace.info("Offset. Packages reduced to: " + str(len(package_names)))
    # go through them and feed elasticsearch; one failing package must not
    # abort the whole run, so log it and continue.
    # except Exception (not a bare except) so KeyboardInterrupt/SystemExit
    # still stop the run.
    for package_name in package_names:
        Trace.info("processing package: " + package_name)
        try:
            self.process_package(package_name)
        except Exception:
            print ("Error processing package: " + package_name + ": " + str(sys.exc_info()[0]))
            continue
def process_package(self, package_name):
    """Build an elasticsearch document for one npm package and upsert it.

    Combines registry metadata (repository type, version count) with the
    average downloads over the last 30 days as reported by npm-stat.
    """
    # "/" is not acceptable in a document id, so scoped names are flattened
    _id = package_name.replace("/", "_")
    # grab npmjs registry information
    npm_registry_info = json.loads(requests.get("http://registry.npmjs.org/" + package_name).text)
    Trace.info("npm_registry_info processed ok")
    # grab npm-stat info for the 30-day window ending today
    today = date.today()
    month_ago = today - timedelta(30)
    npm_stat_info = json.loads(requests.get(
        "http://npm-stat.com/downloads/range/"
        + date.strftime(month_ago, "%Y-%m-%d") + ":"
        + date.strftime(today, "%Y-%m-%d")
        + "/" + package_name).text)
    Trace.info("npm_stat_info processed ok")
    # build the doc and feed elasticsearch.
    # _type first: _type is the repo type of the package, "no_repo" when absent
    _type = "no_repo"
    if "repository" in npm_registry_info and "type" in npm_registry_info["repository"]:
        _type = npm_registry_info["repository"]["type"].replace("/", "_")
    # init document with the number of published versions
    document = {"versions": 0}
    if "versions" in npm_registry_info:
        document["versions"] = len(npm_registry_info["versions"].keys())
    # calculate average downloads; the [0] default keeps the mean defined
    # when npm-stat returns nothing
    downloads = [0]
    if "downloads" in npm_stat_info and len(npm_stat_info["downloads"]) > 0:
        downloads = [item["downloads"] for item in npm_stat_info["downloads"]]
    # sum() replaces the reduce(lambda x, y: x + y, ...) idiom
    document["average_downloads"] = sum(downloads) / len(downloads)
    # insert document
    Trace.info("about to upsert")
    Trace.info(json.dumps(self.elasticsearch.upsert_document(self._index, _type, _id, document)))
    Trace.info("upserted")
def __init__(self, test=False):
    "Inits the script: switch to test files/indexes if asked, then build every index."
    Trace.info("Starting" + (" test" if test else " ") + " script...")
    # change paths and indexes in case of test
    if test:
        # test mode reads the reduced CSV fixtures...
        self.hotels_file = os.path.join(self.filedir, "./data/hotels_test.csv")
        self.comments_file = os.path.join(self.filedir, "./data/comments_test.csv")
        self.bitext_file = os.path.join(self.filedir, "./data/bitext_tuipilot_test.csv")
        # ...and writes into dedicated test indexes
        self.hotels_index = "test_hotels"
        self.comments_index = "test_comments"
        self.bitext_index = "test_bitext"
        self.bitext_unique_index = "test_bitext_unique"
        self.bitext_unique_posneg_index = "test_bitext_unique_posneg"
    # build order matters: bitext enriches its docs from hotels and comments
    self.build_hotels_index()
    self.build_comments_index()
    self.build_bitext_indexes()
    Trace.info(("Test s" if test else "S") + "cript finished.")
def write(self, strings):
    "Write each string in the list; abort on the first non-string value."
    for item in strings:
        if not isinstance(item, basestring):
            # stop writing entirely rather than emit garbage mid-stream
            Trace.error('Not a string: ' + unicode(item) + ' in ' + unicode(strings))
            return
        self.writestring(item)
def readall(self):
    "Read the whole file, trying each configured encoding until one decodes."
    for codecname in FileConfig.parsing['encodings']:
        try:
            return self.readcodec(codecname)
        except UnicodeDecodeError:
            # this encoding did not fit; try the next one
            pass
    Trace.error('No suitable encoding for ' + self.filename)
    return []
def readall(self):
    "Return the decoded file contents, or [] if no configured encoding works."
    for candidate in FileConfig.parsing["encodings"]:
        try:
            return self.readcodec(candidate)
        except UnicodeDecodeError:
            continue
    Trace.error("No suitable encoding for " + self.filename)
    return []
def dotseparated(self, number):
    "Get the number separated by dots: 1.1.3"
    if len(number) == 0:
        # an empty number is an error; '.' is the sentinel callers receive
        Trace.error('Empty number')
        return '.'
    # join is linear; repeated string += is quadratic
    return '.'.join(unicode(piece) for piece in number)
def removebackdirs(self):
    "Strip any leading ../ (or ..\\ on Windows) from the path and the url."
    self.path = os.path.normpath(self.path)
    parentref = '..' + os.path.sep
    # normpath collapses interior back-references; leading ones remain and
    # are peeled off one at a time here
    while self.path.startswith(parentref):
        Trace.debug('Backdir in: ' + self.path)
        self.path = self.path[len(parentref):]
    # the url always uses forward slashes, regardless of platform
    prefix = '../'
    while self.url.startswith(prefix):
        Trace.debug('Backdir in: ' + self.url)
        self.url = self.url[len(prefix):]
def gethtml(self, container):
    """Return the HTML code for a container.

    Concatenates the HTML of every child element; returns what was
    accumulated so far if a child cannot produce HTML.
    """
    html = []
    # 'is None' for identity comparison (was '== None')
    if container.contents is None:
        return html
    for element in container.contents:
        if not hasattr(element, 'gethtml'):
            Trace.error('No html in ' + element.__class__.__name__ + ': ' + unicode(element))
            return html
        html += element.gethtml()
    return html
def convert(self, filename, directory = ''):
    "Convert the filename adding the appropriate directories."
    # try the name as given, then under self.directory, then under directory
    for candidate in (filename,
                      os.path.join(self.directory, filename),
                      os.path.join(directory, filename)):
        if os.path.exists(candidate):
            return candidate
    Trace.error('Missing file ' + filename)
    return None
def convert(self, filename, directory=''):
    "Convert the filename adding the appropriate directories."
    # a filename that already resolves needs no prefixing
    if os.path.exists(filename):
        return filename
    # next preference: relative to this object's base directory
    indir = os.path.join(self.directory, filename)
    if os.path.exists(indir):
        return indir
    # last resort: the caller-supplied directory
    fallback = os.path.join(directory, filename)
    if os.path.exists(fallback):
        return fallback
    Trace.error('Missing file ' + filename)
    return None
def increase(self, number):
    "Increase the number (or letter)"
    # numeric values simply increment
    if not isinstance(number, str):
        return number + 1
    # '-' restarts the letter sequence from the beginning
    if number == '-':
        return self.letter(0)
    if number not in NumberGenerator.letters:
        Trace.error('Unknown letter numeration ' + number)
        return 0
    return self.letter(NumberGenerator.letters.index(number) + 1)
def readline(self):
    "Advance to the next line of the file, tracking position for Trace."
    line = self.file.readline()
    # raw byte streams must be decoded; codec-wrapped files already yield text
    if not isinstance(self.file, codecs.StreamReaderWriter):
        line = line.decode('utf-8')
    # an empty result (checked before stripping newlines) means end of file
    if not line:
        self.depleted = True
    self.current = line.rstrip('\n\r')
    self.linenumber += 1
    self.mustread = False
    Trace.prefix = 'Line ' + unicode(self.linenumber) + ': '
    # progress heartbeat every thousand lines
    if self.linenumber % 1000 == 0:
        Trace.message('Parsing')
def generateordered(self, type):
    """Generate ordered numbering: a number to use and possibly concatenate
    with others. Example: Chapter 1, Section 1.5."""
    level = self.getlevel(type)
    if level == 0:
        Trace.error('Impossible level 0 for ' + type)
        return '.'
    # truncate deeper levels, or pad with zeros up to the requested level
    if len(self.number) >= level:
        self.number = self.number[:level]
    else:
        self.number += [0] * (level - len(self.number))
    # bump the counter at the requested level
    self.number[level - 1] = self.increase(self.number[level - 1])
    return self.dotseparated(self.number)
def __init__(self, *argv):
    """Analyze the command line args and launch the Twitter location stream.

    Expects -sw and -ne flags with longitude,latitude values, plus an
    optional output filename (stdout when omitted). Shows usage and
    returns on any argument error.
    """
    southwest = None
    northeast = None
    output = sys.stdout
    # set KeyboardInterrupt signal handler so ctrl+c stops the stream cleanly
    signal.signal(signal.SIGINT, self.keyboard_interrupt_handler)
    # turn the tuple into a list and drop the program name
    argv = list(argv)
    argv.pop(0)
    # iterate over a copy, since each flag and its value are removed as found
    for argument in argv[:]:
        # look for southwest
        if argument == "-sw":
            try:
                southwest = argv[argv.index("-sw") + 1]
            except IndexError:
                # -sw given without a value
                self.usage()
                return
            argv.remove("-sw")
            argv.remove(southwest)
        # look for northeast
        if argument == "-ne":
            try:
                northeast = argv[argv.index("-ne") + 1]
            except IndexError:
                self.usage()
                return
            argv.remove("-ne")
            argv.remove(northeast)
    # both corners are compulsory; at most one positional argument may remain
    if southwest is None or northeast is None or len(argv) > 1:
        self.usage()
        return
    # the remaining positional, if any, is the output file
    if len(argv) == 1:
        output = open(argv[0], "w")
    # launch the LocationStream with the bounding box
    self.twitterstream = LocationStream(southwest + "," + northeast)
    try:
        stream = self.twitterstream.start()
        Trace.message("Twitter stream started!!")
        Trace.message("Press ctrl+c to stop.")
    except Exception:
        # narrowed from a bare except so KeyboardInterrupt/SystemExit propagate
        Trace.error("Raised exception: " + str(sys.exc_info()[0]))
        Trace.error("Stopping twitterstream")
        self.twitterstream.stop()
        return
    for line in stream:
        print >> output, line.strip()
def number(self, layout):
    "Set all attributes: number, entry, level..."
    # unique layouts get a standalone number; the anchor text is the entry
    # followed by a dot, but only when a number was actually assigned
    if self.generator.isunique(layout):
        number = self.generator.generateunique(layout.type)
        self.setcommonattrs(layout, number)
        layout.anchortext = ''
        if layout.number != '':
            layout.anchortext = layout.entry + '.'
        return
    # anything that is neither unique nor in the ordered set is a mistake
    if not self.generator.isinordered(layout):
        Trace.error('Trying to number wrong ' + unicode(layout))
        return
    # ordered or unordered: numbered layouts get hierarchical numbering,
    # the rest fall back to a unique number
    if self.generator.isnumbered(layout):
        number = self.generator.generateordered(layout.type)
    else:
        number = self.generator.generateunique(layout.type)
    self.setcommonattrs(layout, number)
    layout.anchortext = layout.number
    # substitute the level into the output tag placeholder (e.g. h? -> h2)
    layout.output.tag = layout.output.tag.replace('?', unicode(layout.level))
def build_comments_index(self):
    "Read the comments CSV and bulk-upsert every row into the comments index."
    Trace.info("Building comments index...")
    # averageWebScore is parsed as int; dots are stripped from the id columns
    # before use (presumably thousands separators in the CSV — confirm)
    typemap = {"averageWebScore": int}
    replacements = [
        {"key": "commentId", "find": ".", "replace": ""},
        {"key": "hotelSequence", "find": ".", "replace": ""},
    ]
    # get the bulk of documents
    comments = CsvManager.read(self.comments_file, typemap=typemap, replace=replacements)
    Trace.info(str(len(comments)) + " comments read")
    # bulk_upsert keyed by hotelSequence/commentId
    upserted = self.elasticsearch.upsert_bulk(self.comments_index, "hotelSequence", "commentId", comments)
    Trace.info(str(upserted) + " comments upserted in " + self.comments_index)
def build_hotels_index(self):
    "Read the hotels CSV and bulk-upsert every row into the hotels index."
    Trace.info("Building hotels index...")
    # every column after the first three is numeric, so map them all to int
    keys = CsvManager.read_keys(self.hotels_file)
    typemap = {key: int for key in keys[3:]}
    # dots are stripped from these columns before conversion
    replacements = [
        {"key": "hotelSequence", "find": ".", "replace": ""},
        {"key": "mailsEnviados", "find": ".", "replace": ""},
    ]
    # get the bulk of documents
    hotels = CsvManager.read(self.hotels_file, typemap=typemap, replace=replacements)
    Trace.info(str(len(hotels)) + " hotels read")
    # bulk_upsert keyed by destinationCode/hotelSequence
    upserted = self.elasticsearch.upsert_bulk(self.hotels_index, "destinationCode", "hotelSequence", hotels)
    Trace.info(str(upserted) + " hotels upserted in " + self.hotels_index)
def findtranslation(self):
    "Find the translation for the document language."
    self.langcodes = None
    if not self.language:
        Trace.error('No language in document')
        return
    if self.language not in TranslationConfig.languages:
        Trace.error('Unknown language ' + self.language)
        return
    code = TranslationConfig.languages[self.language]
    # English is the source language; no catalog lookup is needed
    if code == 'en':
        return
    langcodes = [code]
    try:
        self.translation = gettext.translation('elyxer', None, langcodes)
    except IOError:
        Trace.error('No translation for ' + unicode(langcodes))
def __init__(self, test=False, offset = 0):
    "Inits the script: optionally switch to test data, then build the npm index."
    global test_packages
    Trace.info("Starting" + (" test" if test else " ") + " script...")
    # change paths and indexes in case of test
    if test:
        # load the short, fixed list of package names from the test CSV
        test_packages_file = os.path.join(self.filedir, "./data/test_npm_package_names")
        test_packages = [row["test_package_name"] for row in CsvManager.read(test_packages_file)]
        self._index = "test_npm_packages"
        Trace.info("test_packages: " + json.dumps(test_packages))
    # the offset lets a previous partial run be resumed
    self._offset = offset
    # build npm_packages_index
    self.build_npm_packages_index()
    Trace.info(("Test s" if test else "S") + "cript finished.")
def nextline(self):
    "Flag that the next read must fetch a fresh line; fatal if already past EOF."
    if self.depleted:
        Trace.fatal('Read beyond file end')
    self.mustread = True
def build_bitext_indexes(self):
    "Builds bitext, bitext_unique and bitext_unique_posneg indexes"
    Trace.info("Building bitext, bitext_unique and bitext_unique_posneg indexes...")
    # typemap and replace: scores arrive with comma decimals in the CSV,
    # so ',' is turned into '.' before the float conversion
    bitext_replace = [{"key":"score", "find":",", "replace":"."}]
    bitext_typemap = {"score": float}
    # get the bulk of bitexts
    bitexts = CsvManager.read(self.bitext_file, typemap=bitext_typemap, replace=bitext_replace)
    # iterate the bulk of bitexts and insert the element in each of the indexes
    for _id,bitext_item in enumerate(bitexts):
        # add info from hotels
        hotel = self.elasticsearch.read_document(self.hotels_index, "_all", bitext_item["hotelSequence"])
        if "found" in hotel and hotel["found"]:
            # add found hotel fields to bitext item
            # NOTE(review): dict(a.items() + b.items()) is Python 2 only
            # (items() returns lists there); breaks under Python 3
            bitext_item = dict(bitext_item.items() + hotel["_source"].items())
        # upsert element: the CSV "section" column becomes the document type
        # and is removed from the document body
        bitext_type = bitext_item["section"]
        del bitext_item["section"]
        Trace.info("upserting bitext " + str(_id))
        self.elasticsearch.upsert_document(self.bitext_index, bitext_type, str(_id), bitext_item)
        # update bitext_unique_posneg index: one document per
        # (commentId, section), keeping a running average score and the
        # accumulated category list
        previous_average_score = 0
        previous_count = 0
        previous_categories = ""
        separator = ""
        bitext_unique_posneg_id = bitext_item["commentId"] + bitext_type
        bitext_unique_posneg_item = self.elasticsearch.read_document(self.bitext_unique_posneg_index, "_all", bitext_unique_posneg_id)
        if "found" in bitext_unique_posneg_item and bitext_unique_posneg_item["found"]:
            previous_count = bitext_unique_posneg_item["_source"]["count"]
            previous_average_score = bitext_unique_posneg_item["_source"]["averageScore"]
            previous_categories = bitext_unique_posneg_item["_source"]["category"]
            separator = ", "
        bitext_unique_posneg_upsert_doc = {
            "section": bitext_type,
            # incremental mean: (old_mean*old_count + new_score)/(old_count+1);
            # the 1.0* forces float division under Python 2
            "averageScore": 1.0*(previous_average_score*previous_count + bitext_item["score"])/(previous_count + 1),
            "count": previous_count + 1,
            "category": previous_categories + separator + bitext_item["category"]
        }
        # upsert
        self.elasticsearch.upsert_document(self.bitext_unique_posneg_index,
            bitext_item["hotelSequence"], bitext_unique_posneg_id, bitext_unique_posneg_upsert_doc)
        # update bitext_unique index: one document per commentId, across
        # all sections, with the same running-average scheme
        previous_average_score = 0
        previous_count = 0
        previous_categories = ""
        separator = ""
        bitext_unique_id = bitext_item["commentId"]
        bitext_unique_item = self.elasticsearch.read_document(self.bitext_unique_index, "_all", bitext_unique_id)
        if "found" in bitext_unique_item and bitext_unique_item["found"]:
            previous_count = bitext_unique_item["_source"]["count"]
            previous_average_score = bitext_unique_item["_source"]["averageScore"]
            previous_categories = bitext_unique_item["_source"]["category"]
            separator = ", "
        bitext_unique_upsert_doc = {
            "averageScore": 1.0*(previous_average_score*previous_count + bitext_item["score"])/(previous_count + 1),
            "count": previous_count + 1,
            "category": previous_categories + separator + bitext_item["category"]
        }
        # look for the comment in the comment index
        comment = self.elasticsearch.read_document(self.comments_index, "_all", bitext_unique_id)
        if "found" in comment and comment["found"]:
            # add found comment averageWebScore to bitext unique item,
            # plus the signed and absolute difference against our average
            bitext_unique_upsert_doc["averageWebScore"] = comment["_source"]["averageWebScore"]
            bitext_unique_upsert_doc["scoresDiff"] = bitext_unique_upsert_doc["averageScore"] - bitext_unique_upsert_doc["averageWebScore"]
            bitext_unique_upsert_doc["scoresAbsDiff"] = math.fabs(bitext_unique_upsert_doc["scoresDiff"])
        # upsert
        self.elasticsearch.upsert_document(self.bitext_unique_index, bitext_item["hotelSequence"], bitext_unique_id, bitext_unique_upsert_doc)
def usage(self):
    "Print the command-line usage for coalesce.py."
    Trace.error('Usage: coalesce.py filein [fileout]')
# (continuation of a test method whose opening `def` lies outside this chunk;
# last_bitext is presumably read from elasticsearch above — confirm there)
self.assertEquals(last_bitext["_source"]["score"], 2.0)
self.assertEquals(last_bitext["_source"]["mailsEnviados"], 37)
# test bitext_unique_posneg index
bitext330956POS = self.elasticsearch.read_document("test_bitext_unique_posneg", "69559", "330956POS")
self.assertTrue(bitext330956POS["found"])
self.assertEquals(bitext330956POS["_source"]["averageScore"], 2.0)
# test bitext_unique index
bitext330956 = self.elasticsearch.read_document("test_bitext_unique", "69559", "330956")
self.assertTrue(bitext330956["found"])
self.assertEquals(bitext330956["_source"]["averageScore"], 2.0)
self.assertEquals(bitext330956["_source"]["averageWebScore"], 5)
self.assertEquals(bitext330956["_source"]["scoresDiff"], -3.0)
self.assertEquals(bitext330956["_source"]["scoresAbsDiff"], 3.0)

def tearDown(self):
    # delete indexes so every test run starts from a clean elasticsearch state
    self.elasticsearch.remove_index("test_hotels")
    self.elasticsearch.remove_index("test_comments")
    self.elasticsearch.remove_index("test_bitext")
    self.elasticsearch.remove_index("test_bitext_unique_posneg")
    self.elasticsearch.remove_index("test_bitext_unique")

if __name__ == '__main__':
    #unittest.main()
    # "test" as the first argument runs the unit tests; otherwise run the script
    if len(sys.argv)>1 and sys.argv[1] == "test":
        Trace.info("test")
        unittest.main(argv=sys.argv[:1], exit=True)
    else:
        Trace.info("main")
        _Main()
def keyboard_interrupt_handler(self, signal, frame):
    "handles KeyboardInterrupt signal"
    # NOTE: the 'signal' parameter shadows the signal module inside this handler
    Trace.message("\nProcess interrupted by user. Exiting...")
    # stop the stream before exiting so the connection is shut down cleanly
    self.twitterstream.stop()
    sys.exit(0)
@unittest.skipIf(not(elasticsearch.is_up()), "irrelevant test if there is no elasticsearch instance")
def test_script(self):
    # run the script in test mode; it builds the test_npm_packages index
    global test_packages
    _Main(test = True)
    # count documents: the test run must have produced at least one document
    self.assertTrue(self.elasticsearch.count_documents("test_npm_packages") > 0)
    # assert express — presumably present in the test package list; confirm
    # against the test data file
    express_package = self.elasticsearch.read_document("test_npm_packages", "_all", "express")
    self.assertTrue(express_package["found"])

def tearDown(self):
    # delete indexes created by the test run
    self.elasticsearch.remove_index("test_npm_packages")

if __name__ == '__main__':
    #unittest.main()
    # "test" as the first argument runs the unit tests; a numeric argument
    # is used as the package offset; no argument runs the full script
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        Trace.info("test")
        unittest.main(argv=sys.argv[:1], exit=True)
    else:
        if len(sys.argv) > 1:
            try:
                offset = int(sys.argv[1])
                Trace.info("main with offset: " + str(offset))
                _Main(offset = offset)
            except:
                # NOTE(review): this bare except silently ignores a non-integer
                # argument AND any error raised inside _Main — consider
                # narrowing to ValueError and reporting the bad argument
                pass
        else:
            Trace.info("main")
            _Main()
def usage(self):
    """Show command line help.

    The flag names here must match the parser, which accepts the
    single-dash forms -sw and -ne (the old text said --sw/--ne).
    """
    Trace.error('Usage: twitterstream.py -sw 2.012,45.3232 -ne 3.119,48.8777 [fileout]')
    Trace.error('Launch a twitter stream client and send the result to an output')
    Trace.error('[fileout]: the file to dump the output. Stdout if omitted')
    Trace.error(' Parameters:')
    Trace.error(' -sw: longitude,latitude coordinates of the South West corner of the bounding box. Compulsory.')
    Trace.error(' -ne: longitude,latitude coordinates of the North East corner of the bounding box. Compulsory.')
    Trace.error('Example: python run_location_stream.py -sw -11.733398,35.763229 -ne 5.009766,42.970492')