def main():
    ap = args.get_parser()
    ap.add_argument('--folder', type=str, help='the file folder')
    ap.add_argument('--c', dest='country', type=str, nargs='+', help='country list')
    ap.add_argument('--file', type=str, help='tweet file')
    ap.add_argument('--source', type=str, help='data source', default='datasift')
    arg = ap.parse_args()

    if arg.country is not None:
        country_list = arg.country
    else:
        country_list = COUNTRY

    if arg.file:
        for country in country_list:
            filter_by_userbelong(arg.file, country)
    elif arg.folder:
        files = os.listdir(arg.folder)
        for f in files:
            f = os.path.join(arg.folder, f)
            if os.path.isfile(f):
                for country in country_list:
                    filter_by_userbelong(f, country)
def main():
    ap = args.get_parser()
    ap.add_argument('--rev_file', metavar='FILE', type=str, required=False,
                    help='File which stores last revision id.',
                    default="last_rev_file.txt")
    arg = ap.parse_args()
    logs.init(arg)
    log.info("Run Started")

    api = API()
    recent_changes = api.get_recent_changes()

    # Filter to only revisions newer than last run
    last_rev_file = arg.rev_file
    if os.path.exists(last_rev_file):
        with open(last_rev_file) as f:
            last_rev = int(f.read())
        recent_changes = [change for change in recent_changes if change['revid'] > last_rev]

    recently_changed_page_ids = set(str(change['pageid']) for change in recent_changes)
    latest_revisions = api.get_latest_revision(recently_changed_page_ids)
    print(len(latest_revisions))
    print(latest_revisions)
    # TODO actually do something with revisions

    with open(last_rev_file, mode='w') as f:
        f.write(str(max([int(page['revisions'][0]['revid'])
                         for page in latest_revisions.values()])))
def main():
    global log
    ap = args.get_parser()
    ap.add_argument("--s_date", type=str, help="the start date to ingest: format mmddyyyy")
    ap.add_argument("--e_date", type=str, help="the end date to ingest: format mmddyyyy")
    ap.add_argument("--o", type=str, help="the output directory")
    ap.add_argument("--region", type=str, help="the region of the web site")
    arg = ap.parse_args()
    logs.init(arg)

    t_format = "%m%d%Y"
    s_date = datetime.strptime(arg.s_date, t_format)
    e_date = datetime.strptime(arg.e_date, t_format)
    d_delta = (e_date - s_date).days

    seen_it = shelve.open("%s_reuters_news_seen_it.db" % (arg.region))
    i = 0
    while i <= d_delta:
        day_str = datetime.strftime(s_date + timedelta(days=i), t_format)
        print "Extracting %s" % (day_str)
        # Write the day's news to a daily file.
        with open("%s%s%s_ita_reuters_%s.txt" % (arg.o, os.sep, day_str, arg.region), "w") as w:
            daily_news = get_daily_news(day_str, seen_it, arg.region)
            for news in daily_news:
                w.write(json.dumps(news) + "\n")
        i += 1
def main():
    ap = args.get_parser()
    ap.add_argument('--test', action="store_true",
                    help="Test flag; if present, treat this run as a test case")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to'
    assert arg.pub, 'Need a queue to publish to'

    logs.init(arg)
    queue.init(arg)
    test_flag = arg.test

    conn = boto.connect_sdb()

    with queue.open(arg.sub, 'r') as inq:
        for m in inq:
            try:
                durationProcess(conn, m, arg.pub, test_flag)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
def main(): """ Utility to cache messages from all queues from the --hostname provided with 'cache: true' option set in embers.conf --hostname : Cache all active queues on this host --log_file : Path to write the log file to --log_level : Logging level """ from etool import args global log arg_parser = args.get_parser() arg_parser.add_argument('--hostname', metavar='HOSTNAME', type=str, default=environ.get('HOSTNAME', None), help="The hostname of the machine whose services' data you wish to cache") arg = arg_parser.parse_args() log = logs.getLogger(log_name=arg.log_file) logs.init(arg, l=arg.log_level, logfile=arg.log_file) conf.init(arg) assert arg.hostname, '--hostname must be provided' queues = conf.get_all_cached_queues(hostname=arg.hostname) pool = [] for queue in queues: log.info('Spawning cache process for %s' % queue) p = multiprocessing.Process(name=queue, target=cache_queue, args=(queue,)) p.start() pool.append(p) try: for process in pool: process.join() log.warn('%s caching has stopped' % process.name) except KeyboardInterrupt: log.warn('Keyboard interrupt in main')
def parse_args():
    ap = args.get_parser()
    ap.add_argument('-c', '--model_cfg', metavar="MODEL_CFG",
                    default=os.path.join(os.path.dirname(__file__), "bayesian_model.conf"),
                    type=str, nargs='?', help='the config file')

    # Ports
    ap.add_argument('-zs', '--surrogate_port', metavar="SURROGATE_PORT",
                    default="tcp://*:30114", type=str, nargs="?", help="The zmq port")
    ap.add_argument('-zw', '--warning_port', metavar="WARNING_PORT",
                    default="tcp://*:30115", type=str, nargs="?", help="The zmq port")

    # Domains
    ap.add_argument('--surrogate_domain', metavar="SURROGATE_DOMAIN",
                    default="t_surrogatedata", type=str, nargs="?",
                    help="The SimpleDB domain for storing surrogate data")
    ap.add_argument('--warning_domain', metavar="WARNING_DOMAIN",
                    default="t_warningmessage", type=str, nargs="?",
                    help="The SimpleDB domain for storing warning data")

    # Set up time parameters to allow running the model for past dates.
    utc_dt = T_UTC.localize(datetime.utcnow())
    eas_dt = utc_dt.astimezone(T_EASTERN)
    default_day = datetime.strftime(eas_dt + timedelta(days=1), "%Y-%m-%d")
    ap.add_argument('--predict_date', metavar="PREDICT_DATE", type=str,
                    default=default_day, nargs="?", help="The day to be predicted")
    ap.add_argument('--stock_list', metavar="Stock List", type=str, nargs="+",
                    help="The list of stocks to be predicted")
    ap.add_argument('--rege_date', metavar="REGE_DATE", type=str,
                    help="The date that needs to be regenerated")
    return ap.parse_args()
def main():
    # Initialize arguments
    argparser = args.get_parser()
    argparser.add_argument('--json_file', help='JSON file to publish', required=True)
    arg = argparser.parse_args()

    queue.init(arg)
    writer = queue.open(arg.pub, 'pub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)

    try:
        msg_reader = codecs.open(arg.json_file, encoding='utf-8', mode='r')
        message = msg_reader.readline()
        while message:
            writer.write(json.loads(message))
            message = msg_reader.readline()
        msg_reader.close()
    except KeyboardInterrupt:
        pass

    return 0
def main(): ap = args.get_parser() ap.add_argument("--dir", type=str, help="directory of company member") ap.add_argument("--o", type=str, help="the directory of output ") arg = ap.parse_args() assert arg.dir, 'Need a dir to explor' rules = {} print arg os.chdir(arg.dir) for f in glob.glob('*.csv'): stock = f.split(".")[0].split("_")[1] country = COUNTRY_MARKET[stock] rules[country] = [] with open(f, 'r') as f_r: i = 0 for line in f_r: i += 1 if i >= 2: l = line.strip().split(",") company = l[2] if company == "": continue tmp = company.split(" ") if len(tmp) > 1: tmp = tmp[0:len(tmp) - 1] if company is not None: rules[country].append(company.strip()) rules[country].append(country) with open(arg.o, "w") as o_w: o_w.write(json.dumps(rules))
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--inputFolder', type=str,
                    help='input folder containing twitter files',
                    default='/hdd/tweets/2012/may')
    ap.add_argument('-s', '--scoresFolder', type=str,
                    help='folder containing scoreCards',
                    default='../data/scores/MX/')
    ap.add_argument('-cf', '--configFile', type=str,
                    help='election configuration file',
                    default='../configFiles/electionConfig_MX')
    ap.add_argument('-d1', '--fromDate', type=str, help='fromDate')
    ap.add_argument('-d2', '--toDate', type=str, help='toDate')
    ap.add_argument('-f1', '--flag1', help="countOrPredict", type=str, default='2')
    ap.add_argument('-r', '--regression', help="regressionType", type=str, default='LASSO')
    ap.add_argument('-f2', '--flag2', help="flag to push surrogates and warnings to S3",
                    type=str, default='0')
    arg = ap.parse_args()
    logs.init(arg)

    try:
        elections = Elections(arg.inputFolder, arg.scoresFolder, arg.configFile,
                              arg.fromDate, arg.toDate)
        log.info("Election class initialized")
    except Exception as e:
        log.exception("exception during initialization: %s. Quitting!!", e)

    try:
        if (arg.flag1 == '1' or arg.flag1 == '3'):
            elections.collectMentions()
    except Exception as e:
        log.exception("error while tracking tweets")

    try:
        if (arg.flag1 == '2' or arg.flag1 == '3'):
            winner, winningScore, runnerUp, runnerUpScore, finalScore = elections.getWinner(arg.fromDate, arg.toDate, arg.regression)
            print "------------Regression Results-----------"
            print finalScore
            print winner + "====>" + str(winningScore)
            print "-----------------------------------------"
    except Exception as e:
        log.exception("error while calculating winner:%s", e)

    try:
        elections.createSurrogate(winner, winningScore, runnerUp, runnerUpScore, arg.flag2)
    except Exception as e:
        log.exception("error during creating warnings")

    try:
        if (arg.flag2 == '1'):
            elections.storeStatistics(arg.fromDate, arg.toDate)
    except Exception as e:
        log.exception("error in storing statistics:%s", e)

    log.info("ALL Operations Complete")
def main():
    # Initialize arguments
    argparser = args.get_parser()
    argparser.add_argument('--local_port', help='Local port to connect to java server', required=True)
    arg = argparser.parse_args()
    localPort = int(arg.local_port)

    # Initialize log
    logs.init(arg)
    global log

    # Initialize the queue with arguments and connect to the specified feed
    log.info("Opening and connecting to queue %s", arg.sub)
    queue.init(arg)
    reader = queue.open(arg.sub, 'sub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)

    # Initialize the writer to publish to a queue
    log.info("Publishing to queue %s", arg.pub)
    writer = queue.open(arg.pub, 'pub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)

    count = 0

    # Connect to Java server
    while True:
        for feedmsg in reader:
            try:
                while True:
                    try:
                        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
                        sock.connect(("localhost", localPort))
                        break
                    except:
                        log.info("Unable to connect to local server")

                log.debug("Connected to java server on port %d" % localPort)
                socketLines = sock.makefile()

                # Clean the message to fix irregularities
                feedmsg = message.clean(feedmsg)

                log.debug("Read message %d. Sending to java" % count)

                # Write message to socket stream
                sock.sendall(json.dumps(feedmsg))
                sock.sendall('\n')

                # Receive result from socket stream
                result = socketLines.readline()
                writer.write(json.dumps(result))
                count += 1

                sock.close()
            except KeyboardInterrupt:
                sys.exit(1)
        else:
            log.info("Server was disconnected.")
def parse_arg():
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true", help="Flag: input from stdin")
    ap.add_argument('--user', dest='user_file', type=str, help='User file')
    ap.add_argument('--folder', type=str, help="tweets folder")
    ap.add_argument('--month', type=str)
    arg = ap.parse_args()
    return arg
def main(): ap = args.get_parser() ap.add_argument("--da", help="the updated news file for analyze") ap.add_argument("--k", type=int, help="the number of topic") arg = ap.parse_args() assert arg.da, "Please input a news file" assert arg.k, "Please input the number of topics" compute_topic_daily(arg.da, arg.k)
def main():
    '''
    Reads from the queue, retrieves the content from the source website
    and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', (entry, ))
        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', (tweet, ))
        return 0
    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
def main():
    ap = args.get_parser()
    ap.add_argument('--df', help='The datafile to train')
    ap.add_argument('--od', help='the directory to store output')
    arg = ap.parse_args()

    lda = Lda()
    lda.doc_process(arg.df)
    lda.out_put_wordmap(arg.od)
    lda.estimate()
    lda.out_put_result(arg.od)
def main():
    '''
    Reads from the queue, retrieves the content from the source website
    and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    ap.add_argument('--region', metavar='REGION', type=str, default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)
        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"], filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub, 'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)
        return 0
    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
def parse_args():
    T_UTC = pytz.utc
    T_EASTERN = pytz.timezone("US/Eastern")

    ap = args.get_parser()
    # ap.add_argument('-f', dest="bloomberg_price_file", metavar="STOCK PRICE", type=str,
    #                 help='The stock price file')  # read stdin, write stdout
    ap.add_argument('-t', dest="trend_file", metavar="TREND RANGE FILE",
                    default="./trendRange.json", type=str, nargs='?',
                    help="The trend range file")

    utc_dt = T_UTC.localize(datetime.utcnow())
    eas_dt = utc_dt.astimezone(T_EASTERN)
    default_day = datetime.strftime(eas_dt, "%Y-%m-%d")
    ap.add_argument('-d', dest="operate_date", metavar="OPERATE DATE", type=str,
                    default=default_day, nargs="?", help="The day to be processed")
    ap.add_argument('-sd', dest="start_date", metavar="START OPERATE DATE", type=str,
                    nargs="?", help="The first day to be processed")
    ap.add_argument('-ed', dest="end_date", metavar="END OPERATE DATE", type=str,
                    nargs="?", help="The last day to be processed")
    return ap.parse_args()
def main():
    ap = args.get_parser()
    ap.add_argument('--replay', action="store_true",
                    help="Replay flag; if present, treat this run as a test case")
    # If the rule file is not given as an argument, it is loaded from sys.stdin.
    ap.add_argument('--rulefile', type=str, help="The rule file for the duration analysis model")
    arg = ap.parse_args()

    if not arg.replay:
        assert arg.sub, 'Need a queue to subscribe to'
        assert arg.pub, 'Need a queue to publish to'

    logs.init(arg)
    queue.init(arg)
    test_flag = arg.replay

    if arg.rulefile:
        rule = eval(open(arg.rulefile).read())
    else:
        # Load the rules from sys.stdin.
        rule = eval(sys.stdin.read())

    conn = boto.connect_sdb()

    if not arg.replay:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    replayIO = StringIO.StringIO()
                    durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except EmbersException as e:
                    log.exception(e.value)
                except:
                    log.exception("Unexpected exception in process")
    else:
        # In replay mode, the model takes an enriched file as input.
        enrich_messages = sys.stdin.readlines()
        for m in enrich_messages:
            m = json.loads(m.strip())
            try:
                replayIO = StringIO.StringIO()
                durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
def main():
    ap = args.get_parser()
    ap.add_argument('--filedir', type=str, help="analysis files")
    ap.add_argument('--window', type=int, default=7)
    ap.add_argument('--result', type=str, help="result")
    arg = ap.parse_args()

    start_date = "2012-12-08"
    end_date = "2013-05-31"
    dates = date_seed(start_date, end_date)
    for d in dates:
        detector = Detector(d, arg.window, arg.filedir, arg.result)
        detector.load_files()
        detector.detect()
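# `date_seed` is defined elsewhere in this codebase and is not shown here. As a
# rough sketch only: based on the call in main() above, it is assumed to produce
# every "YYYY-MM-DD" date string from start_date to end_date inclusive. The
# signature and behavior below are assumptions, not the project's actual helper.
from datetime import datetime, timedelta

def date_seed(start_date, end_date, fmt="%Y-%m-%d"):
    """Yield each date string between start_date and end_date, inclusive."""
    start = datetime.strptime(start_date, fmt)
    end = datetime.strptime(end_date, fmt)
    for i in range((end - start).days + 1):
        yield datetime.strftime(start + timedelta(days=i), fmt)

# Example: list(date_seed("2012-12-08", "2012-12-10"))
# -> ["2012-12-08", "2012-12-09", "2012-12-10"]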
def main():
    ap = args.get_parser()
    ap.add_argument('--o', type=str, help="the output dir to store news")
    arg = ap.parse_args()
    assert arg.o, 'Need a dir to store news'
    logs.init(arg)

    locale.setlocale(locale.LC_TIME, 'es_ES.utf-8')
    seen_it = shelve.open('elfinance_seen_it.db')
    cas = ['finanzas']
    for ca in cas:
        get_category_news(ca, seen_it, arg.o)
def main():
    '''
    Reads from the queue, retrieves the content from the source website
    and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', (entry,))
        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', (tweet,))
        return 0
    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
def main(): """ Utility for warnings stored in Elasticsearch --log_file : Path to write the log file to --log_level : Logging level """ from etool import args global log arg_parser = args.get_parser() arg = arg_parser.parse_args() log = logs.getLogger(log_name=arg.log_file) logs.init(arg, l=arg.log_level, logfile=arg.log_file) print(query(max_results=30))
def main():
    ap = args.get_parser()
    ap.add_argument('--f', type=str, help='the news file')
    arg = ap.parse_args()
    assert arg.f, 'Need a file to ingest'
    assert arg.pub, 'Need a queue to publish to'
    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, 'w') as q_w, open(arg.f, 'r') as f_r:
        for line in f_r:
            news = json.loads(line)
            q_w.write(news)
def main():
    ap = args.get_parser()
    ap.add_argument('--fd', help='the directory of the files to be processed')
    ap.add_argument('--f', help='the file to be processed')
    ap.add_argument('--o', help='the output file name')
    arg = ap.parse_args()
    assert arg.o, 'Need an output file'

    with codecs.open(arg.o, encoding='ascii', mode="w") as out_f:
        if arg.fd:
            process(arg.fd, out_f)
        elif arg.f:
            transfer2ldaf(arg.f, out_f)
        else:
            pass
def main(): NET_TYPE = {"c": comprehend_network, "u": user2user_network, "t": content_based_network, "e": entity_network, "r": entity_corr_network} ap = args.get_parser() ap.add_argument('--out', type=str, help='graph output folder', default='./') ap.add_argument('--inf', type=str, help='tweet input folder') ap.add_argument('--infiles', type=str, nargs='+', help='list of files to be handled') ap.add_argument('--c', type=str, nargs='+', help='list of country') ap.add_argument('--dirf', type=str, help='The folder directly to he handled') ap.add_argument('--net', type=str, help="type of network,each symbol represent each type:c") ap.add_argument('--thre', type=float, default=0.1, help="threshold for corr") arg = ap.parse_args() assert arg.net, "Please input a network type" global threshold threshold = arg.thre if arg.c and len(arg.c) > 0: country_list = arg.c else: country_list = COUNTRY if arg.inf: for country in country_list: in_folder = os.path.join(arg.inf, country.replace(" ", "")) out_folder = os.path.join(arg.out, "graph") out_folder = os.path.join(out_folder, country.replace(" ", "")) for t in arg.net: net_type = NET_TYPE.get(t) handle_by_folder(in_folder, out_folder, country, net_type) elif arg.infiles: for f in arg.infiles: for t in arg.net: net_type = NET_TYPE.get(t) handle_by_file(arg.out, f, country_list[0], net_type) elif arg.dirf: for t in arg.net: net_type = NET_TYPE.get(t) handle_by_folder(arg.dirf, arg.out, country_list[0], net_type)
def main():
    ap = args.get_parser()
    ap.add_argument('--db', help="the path of sqlite db")
    ap.add_argument('--ts', type=str, nargs='+', help="the list of tickers")
    ap.add_argument('--merge', type=str, nargs='+', help='merge files with same date')
    arg = ap.parse_args()

    ts = arg.ts
    db = arg.db
    m_list = arg.merge

    if m_list:
        merg_data(m_list)

    if ts and db:
        conn = lite.connect(arg.db)
        ts = arg.ts
        get_data(conn, ts)
def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()
    assert arg.sub, 'Need a queue to subscribe to!'
    assert arg.out, 'Need a file to store warnings!'

    logs.init(arg)
    queue.init(arg)

    out_file = arg.out
    with queue.open(arg.sub, 'r') as q_r:
        for m in q_r:
            with open(out_file, "a") as out_w:
                if not check_ifexist(m):
                    out_w.write(json.dumps(m) + "\n")
                else:
                    print "Duplicated Warnings"
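# `check_ifexist` is defined elsewhere and not shown here. As a rough sketch only:
# based on the call in main() above, it is assumed to answer "has this warning been
# seen before?". The in-memory set below is an assumption; the real helper may
# instead consult the output file or compare warnings by an id field.
import json

_seen_warnings = set()

def check_ifexist(message):
    """Return True if this warning was already seen; otherwise remember it and return False."""
    key = json.dumps(message, sort_keys=True)  # canonical form so key order does not matter
    if key in _seen_warnings:
        return True
    _seen_warnings.add(key)
    return False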
def main(): """ Utility to cache messages from all queues from the --hostname provided with 'cache: true' option set in embers.conf --hostname : Cache all active queues on this host --log_file : Path to write the log file to --log_level : Logging level """ from etool import args global log arg_parser = args.get_parser() arg_parser.add_argument( '--hostname', metavar='HOSTNAME', type=str, default=environ.get('HOSTNAME', None), help= "The hostname of the machine whose services' data you wish to cache") arg = arg_parser.parse_args() log = logs.getLogger(log_name=arg.log_file) logs.init(arg, l=arg.log_level, logfile=arg.log_file) conf.init(arg) assert arg.hostname, '--hostname must be provided' queues = conf.get_all_cached_queues(hostname=arg.hostname) pool = [] for queue in queues: log.info('Spawning cache process for %s' % queue) p = multiprocessing.Process(name=queue, target=cache_queue, args=(queue, )) p.start() pool.append(p) try: for process in pool: process.join() log.warn('%s caching has stopped' % process.name) except KeyboardInterrupt: log.warn('Keyboard interrupt in main')
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--input', default='sys.stdin', type=str,
                    help='Path to the input file. Default is sys.stdin')
    ap.add_argument('-o', '--out', default='sys.stdout', type=str,
                    help='Path to the output file. Default is sys.stdout')
    ap.add_argument('searchPhrase', default='config/phrases.txt', type=str,
                    help='Path to the phrase file if the "-f" flag is specified; '
                         'otherwise the input string itself is treated as the phrase.')
    ap.add_argument('-f', '--file', action='store_true', default=False,
                    help='If given, the searchPhrase argument is interpreted as a path to a file')

    global logger
    logger = logs.getLogger("%s-%s.log" % (__processor__, str(datetime.now())))
    arg = ap.parse_args()
    logs.init(arg)

    inputFile = None
    outFile = None
    phraseFile = None

    if arg.input == 'sys.stdin':
        reader = codecs.getreader('utf-8')(sys.stdin)
    else:
        inputFile = open(arg.input, "r")
        reader = codecs.getreader('utf-8')(inputFile)

    if arg.out == 'sys.stdout':
        writer = codecs.getwriter('utf-8')(sys.stdout)
    else:
        outFile = codecs.open(arg.out, "w", encoding="utf-8")
        writer = codecs.getwriter('utf-8')(outFile)

    if arg.file:
        phraseFile = codecs.open(arg.searchPhrase, encoding='utf-8')
        generatePhraseList(phraseFile.readlines())
    else:
        generatePhraseList([arg.searchPhrase])

    phraseSearch(reader, writer)

    # Close all files
    if inputFile:
        inputFile.close()
    if outFile:
        outFile.close()
    if phraseFile:
        phraseFile.close()
def main():
    svm_twitter = SVM_Twitter(0.1, 0.1, 'rbf')
    ap = args.get_parser()
    ap.add_argument("--pca_num", default=8, type=int)
    ap.add_argument("--net", type=str)
    ap.add_argument("--k", type=int)
    ap.add_argument("--inf", type=str, help="input folder")
    ap.add_argument("--o_surr", type=str, help="output surrogate file")
    arg = ap.parse_args()

    folder = {"t": "content", "c": "comprehend", "u": "user2user", "e": "entity"}

    assert arg.pub, "Please input a queue to publish surrogate"
    queue.init(arg)
    send_queue = queue.open(arg.pub, "w")

    surr_w = open(arg.o_surr, "w")
    for country in COUNTRY:
        train_file = os.path.join(arg.inf, "%s_train_%d" % (country.replace(" ", ""), arg.k))
        test_file = os.path.join(arg.inf, "%s_test_%d" % (country.replace(" ", ""), arg.k))

        svm_twitter.load_data(train_file, test_file)
        svm_twitter.normalize()
        #svm_twitter.normalize()
        #svm_twitter.pca(arg.pca_num)
        svm_twitter.fit()
        svm_twitter.predict()

        for day in svm_twitter.novel_days:
            surrogate = {"country": country, "date": day.strftime("%Y-%m-%d")}
            send_queue.write(surrogate)
            surr_w.write(json.dumps(surrogate) + "\n")

        print "prediction result: %s " % country
        print [day.strftime("%Y-%m-%d") for day in svm_twitter.novel_days]
        surr_w.flush()

    surr_w.close()
    send_queue.close()
def main(): """ Utility to set up a mapping for an EMBERS queue in Elasticsearch -q | --queue : Queue name to set up the mapping for. Settings are read from embers.conf --log_file : Path to write the log file to --log_level : Logging level """ from etool import args global log arg_parser = args.get_parser() arg_parser.add_argument('-q', '--queue', help='Queue name to map into Elasticsearch') arg = arg_parser.parse_args() assert arg.queue, '--queue must be provided' log = logs.getLogger(log_name=arg.log_file) logs.init(arg, l=arg.log_level, logfile=arg.log_file) add_type(index_name=general.get_index_name(), type_name=arg.queue)
def main():
    ap = args.get_parser()
    default_day = datetime.strftime(datetime.now(), "%Y-%m-%d")
    ap.add_argument("--d", type=str, default=default_day,
                    help="The day to ingest, format: yyyy-mm-dd")
    ap.add_argument("--domain", default="bloomberg_prices",
                    help="The SimpleDB domain to store raw data")
    arg = ap.parse_args()

    assert arg.pub, "Need a queue to publish to"
    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, "w") as out_q:
        for stock in STOCK_CON:
            if stock == "COLCAP":
                scrape_f = scrape_colcap_url
            if stock == "CHILE65":
                scrape_f = scrape_chile65_url
            msg = ingest_price(arg, stock, scrape_f)
            if msg is not None:
                out_q.write(msg)
                store(arg, msg)
def main():
    ap = args.get_parser()
    ap.add_argument('-c', '--conf', metavar='CONF', type=str, nargs='?',
                    default=os.path.join(os.path.dirname(__file__), 'bloomberg_news_ingest.conf'),
                    help='The location of the configuration file.')
    arg = ap.parse_args()
    assert arg.pub, "--pub required. Need a queue to publish on"
    logs.init(arg)

    conf = get_conf(arg.conf)
    seen_it = shelve.open("bloomberg_news_seen_it.db")
    try:
        with queue.open(arg.pub, 'w', capture=True) as outq:
            for (index, companies) in conf.items():
                for company in companies:
                    articles = get_stock_news(index, company, seen_it)
                    for a in articles:
                        outq.write(a)
    except KeyboardInterrupt:
        log.info('GOT SIGINT, exiting')
def main():
    ap = args.get_parser()
    ap.add_argument('--dir')
    arg = ap.parse_args()
    assert arg.pub, "Enter a queue to pub"

    file_folder = arg.dir
    files = os.listdir(file_folder)
    w_queue = queue.open(arg.pub, "w", capture=True)
    for f in files:
        full_f = os.path.join(file_folder, f)
        with open(full_f) as af:
            for d_ana in af:
                temp = d_ana.strip().split("|")
                message = {"country": temp[1],
                           "date": temp[0],
                           "z_value": temp[2],
                           "diff_mag": temp[3]}
                w_queue.write(message)
    w_queue.close()
def main():
    svm_twitter = SVM_Twitter(0.1, 0.1, 'rbf')
    ap = args.get_parser()
    ap.add_argument("--pca_num", default=8, type=int)
    ap.add_argument("--net", type=str)
    ap.add_argument("--k", type=int)
    ap.add_argument("--inf", type=str, help="input folder")
    ap.add_argument("--o_surr", type=str, help="output surrogate file")
    arg = ap.parse_args()

    folder = {"t": "content", "c": "comprehend", "u": "user2user", "e": "entity"}

    assert arg.pub, "Please input a queue to publish surrogate"
    queue.init(arg)
    send_queue = queue.open(arg.pub, "w")

    surr_w = open(arg.o_surr, "w")
    for country in COUNTRY:
        train_file = os.path.join(arg.inf, "%s_train_%d" % (country.replace(" ", ""), arg.k))
        test_file = os.path.join(arg.inf, "%s_test_%d" % (country.replace(" ", ""), arg.k))

        svm_twitter.load_data(train_file, test_file)
        svm_twitter.normalize()
        #svm_twitter.normalize()
        #svm_twitter.pca(arg.pca_num)
        svm_twitter.fit()
        svm_twitter.predict()

        for day in svm_twitter.novel_days:
            surrogate = {"country": country, "date": day.strftime("%Y-%m-%d")}
            send_queue.write(surrogate)
            surr_w.write(json.dumps(surrogate) + "\n")

        print "prediction result: %s " % country
        print [day.strftime("%Y-%m-%d") for day in svm_twitter.novel_days]
        surr_w.flush()

    surr_w.close()
    send_queue.close()
def main():
    ap = args.get_parser()
    ap.add_argument('--level', type=str, default="0.6", help='The threshold')
    ap.add_argument('--svm', action='store_true')
    ap.add_argument('--zmq', action='store_true')
    ap.add_argument('--surr', type=str, help="surrogate file")
    ap.add_argument('--warn', type=str, help="warning file")
    arg = ap.parse_args()

    logs.init(arg)
    queue.init(arg)
    assert arg.pub, "Please input a queue to publish warnings"
    if arg.zmq:
        assert arg.sub, "Please input a queue to sub surrogate messages"

    conn = boto.connect_sdb()
    t_domain = get_domain(conn, "s_holiday")

    if arg.zmq:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    if arg.svm:
                        svm_warning(t_domain, m, arg.pub)
                    else:
                        warning_center(t_domain, m, arg.pub, float(arg.level))
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except:
                    log.exception("Exception in process: %s" % sys.exc_info()[0])
    else:
        with open(arg.warn, "w") as w, open(arg.surr) as r:
            if arg.svm:
                for m in r:
                    m = json.loads(m)
                    warning = svm_warning(t_domain, m, arg.pub)
                    w.write(json.dumps(warning) + "\n")
def main(): ap = args.get_parser() ap.add_argument("--out", type=str, help="the output dir") ap.add_argument("--inf", type=str, help="graph files folder") ap.add_argument("--files", type=str, nargs='+', help="file list") ap.add_argument("--net", type=str) ap.add_argument("--c_folder", type=str) arg = ap.parse_args() assert arg.out, "Please Enter a output dir" if arg.inf: folders = os.listdir(arg.inf) for folder in folders: full_f = os.path.join(arg.inf, folder) if not os.path.isdir(full_f): continue analysis_by_folder(full_f, arg.out, arg.net) elif arg.files: for f in arg.files: analysis_by_file(f, arg.out, arg.net) elif arg.c_folder: analysis_by_folder(arg.c_folder, arg.out, arg.net)
def main():
    '''
    Reads from the queue, retrieves the content from the source website
    and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    ap.add_argument('--region', metavar='REGION', type=str, default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)
        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"], filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub, 'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)
        return 0
    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
def main(): """ Utility to cache messages from a queue into Elasticsearch -q | --queue : Read from <queue> and write the messages to Elasticsearch. Settings are read from embers.conf --log_file : Path to write the log file to --log_level : Logging level """ from etool import args global log arg_parser = args.get_parser() arg_parser.add_argument('-q', '--queue', help='Queue name to index into Elasticsearch') arg_parser.add_argument( '-s', '--s3fromq', action='store_true', help='ingest from S3 prefix derived from queue name') arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix') #arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest') arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest') arg_parser.add_argument( '-l', '--tmpcopy', default='/home/embers/data/tmpcopy', help='Name of local copy of S3 file (same for all S3 files)') arg_parser.add_argument('-c', '--chunk', type=int, default=100, help='Chunk size for S3 ingest') arg_parser.add_argument('-i', '--clustername', help='Clustername to determine index name') arg_parser.add_argument( '-w', '--withbase', action="store_true", help="Add basename to prefix when looking for type.") arg_parser.add_argument('--startdate', help='start date in format like 2015-01-02') arg_parser.add_argument('--enddate', help='end date in format like 2015-01-02') arg = arg_parser.parse_args() #assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided' assert ( arg.queue or arg.prefix ), 'Either --queue (with optional --s3fromq/--typename) or --prefix must be provided' log = logs.getLogger(log_name=arg.log_file) logs.init(arg, l=arg.log_level, logfile=arg.log_file) index_name = general.get_index_name(arg.clustername) queue.init() if arg.prefix or (arg.queue and arg.s3fromq): if arg.prefix: prefix = arg.prefix # get queue name or its substitute for S3 objects from prefix if arg.typename: type_name = arg.typename else: type_name = queue.conf.get_prefixpair( prefix=prefix, includeS3=True, withBasename=arg.withbase) if not type_name: log.error("Could not get type from prefix %s" % prefix) return 1 log.warning("type_name=%s from prefix=%s" % (type_name, prefix)) else: type_name = arg.queue prefix, include = queue.conf.get_prefix_for_queue( type_name, withBasename=False) if not prefix: log.error("Could not get S3 prefix for queue %s" % type_name) return 1 if not general.get_es_connection().indices.exists_type( index=index_name, doc_type=type_name): # Create mapping if the queue has not been stored in Elasticsearch yet index_setup.add_type(index_name=index_name, type_name=type_name) conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key, aws_secret_access_key=arg.aws_secret) bucket = conn_s3.get_bucket( arg.bucket) # connect to S3, get bucket ptr for arg.bucket attach_to_s3(index_name, s3prefix=prefix, bucket=bucket, type_name=type_name, tmpcopy=arg.tmpcopy, chunk_size=arg.chunk, startdate=arg.startdate, enddate=arg.enddate) else: if arg.typename: type_name = arg.typename else: type_name = arg.queue if not general.get_es_connection().indices.exists_type( index=index_name, doc_type=type_name): # Create mapping if the queue has not been stored in Elasticsearch yet index_setup.add_type(index_name=index_name, type_name=type_name) attach_to_queue(index_name=index_name, queue_name=arg.queue, type_name=type_name)