def main():
    ap = args.get_parser()
    ap.add_argument('--folder', type=str, help='the file folder')
    ap.add_argument('--c', dest='country', type=str, nargs='+',
                    help='country list')
    ap.add_argument('--file', type=str, help='tweet file')
    ap.add_argument('--source', type=str, help='data source',
                    default='datasift')
    arg = ap.parse_args()

    if arg.country is not None:
        country_list = arg.country
    else:
        country_list = COUNTRY

    if arg.file:
        for country in country_list:
            filter_by_userbelong(arg.file, country)
    elif arg.folder:
        files = os.listdir(arg.folder)
        for f in files:
            f = os.path.join(arg.folder, f)
            if os.path.isfile(f):
                for country in country_list:
                    filter_by_userbelong(f, country)
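# Note: COUNTRY and filter_by_userbelong come from the surrounding module and are
# not shown above. A minimal, assumed sketch of what they might look like (the
# 'embers_user_country' field and the output file naming are illustrative guesses,
# not the actual implementation):

import json

COUNTRY = ['Argentina', 'Brazil', 'Colombia', 'Mexico', 'Venezuela']

def filter_by_userbelong(tweet_file, country):
    # Append tweets whose user is assigned to `country` into a per-country file.
    out_name = '%s_%s' % (tweet_file, country.replace(' ', ''))
    with open(tweet_file) as f_in, open(out_name, 'a') as f_out:
        for line in f_in:
            try:
                tweet = json.loads(line)
            except ValueError:
                continue
            if tweet.get('embers_user_country') == country:
                f_out.write(json.dumps(tweet) + '\n')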
Example #2
def main():
    ap = args.get_parser()
    ap.add_argument('--rev_file', metavar='FILE', type=str, required=False,
                     help='File which stores last revision id.', default="last_rev_file.txt")
    arg = ap.parse_args()
    logs.init(arg)
    log.info("Run Started")

    api = API()
    recent_changes = api.get_recent_changes()

    # Filter to only revisions newer than last run
    last_rev_file = arg.rev_file
    if os.path.exists(last_rev_file):
        with open(last_rev_file) as f:
            last_rev = int(f.read())
            recent_changes = [change for change in recent_changes if change['revid'] > last_rev]

    recently_changed_page_ids = set(str(change['pageid']) for change in recent_changes)
    latest_revisions = api.get_latest_revision(recently_changed_page_ids)
    print(len(latest_revisions))
    print(latest_revisions)
    # TODO actually do something with revisions
    with open(last_rev_file, mode='w') as f:
        f.write(str(max([int(page['revisions'][0]['revid']) for page in latest_revisions.values()])))
Example #3
def main():
    global log
    ap = args.get_parser()
    ap.add_argument("--s_date", type=str, help="the start date to ingest: format mmddyyyy")
    ap.add_argument("--e_date", type=str, help="the end of date to ingest: format mmddyyyy")
    ap.add_argument("--o", type=str, help="the output directory")
    ap.add_argument("--region", type=str, help="the region of the web site")
    arg = ap.parse_args()
    logs.init(arg)

    t_format = "%m%d%Y"
    s_date = datetime.strptime(arg.s_date, t_format)
    e_date = datetime.strptime(arg.e_date, t_format)
    d_delta = (e_date - s_date).days

    seen_it = shelve.open("%s_reuters_news_seen_it.db" % (arg.region))
    i = 0
    while i <= d_delta:
        day_str = datetime.strftime(s_date + timedelta(days=i), t_format)
        print "Extracting %s" % (day_str)
        "write news to day file"
        with open("%s%s%s_ita_reuters_%s.txt" % (arg.o, os.sep, day_str, arg.region), "w") as w:
            daily_news = get_daily_news(day_str, seen_it, arg.region)
            for news in daily_news:
                w.write(json.dumps(news) + "\n")
        i += 1
Example #4
def main():
    ap = args.get_parser()
    ap.add_argument('--test', action="store_true", help="Test flag; if this argument is present, run as a test case")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to'
    assert arg.pub, 'Need a queue to publish to'

    logs.init(arg)
    queue.init(arg)
    test_flag = arg.test

    conn = boto.connect_sdb()

    with queue.open(arg.sub, 'r') as inq:
        for m in inq:
            try:
                durationProcess(conn, m, arg.pub, test_flag)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
Example #5
def main():
    ap = args.get_parser()
    ap.add_argument('--folder', type=str, help='the file folder')
    ap.add_argument('--c',
                    dest='country',
                    type=str,
                    nargs='+',
                    help='country list')
    ap.add_argument('--file', type=str, help='tweet file')
    ap.add_argument('--source',
                    type=str,
                    help='data source',
                    default='datasift')
    arg = ap.parse_args()

    if arg.country is not None:
        country_list = arg.country
    else:
        country_list = COUNTRY

    if arg.file:
        for country in country_list:
            filter_by_userbelong(arg.file, country)
    elif arg.folder:
        files = os.listdir(arg.folder)
        for f in files:
            f = os.path.join(arg.folder, f)
            if os.path.isfile(f):
                for country in country_list:
                    filter_by_userbelong(f, country)
Example #6
def main():
    """
    Utility to cache messages from all queues on the --hostname provided that have the 'cache: true' option set in embers.conf
    --hostname  : Cache all active queues on this host
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('--hostname', metavar='HOSTNAME', type=str, default=environ.get('HOSTNAME', None),
                            help="The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    conf.init(arg)

    assert arg.hostname, '--hostname must be provided'
    queues = conf.get_all_cached_queues(hostname=arg.hostname)
    pool = []

    for queue in queues:
        log.info('Spawning cache process for %s' % queue)
        p = multiprocessing.Process(name=queue, target=cache_queue, args=(queue,))
        p.start()
        pool.append(p)

    try:
        for process in pool:
            process.join()
            log.warn('%s caching has stopped' % process.name)
    except KeyboardInterrupt:
        log.warn('Keyboard interrupt in main')
Example #7
def parse_args():
    ap = args.get_parser()
    ap.add_argument('-c', '--model_cfg', metavar="MODEL_CFG",
                    default=os.path.join(os.path.dirname(__file__), "bayesian_model.conf"),
                    type=str, nargs='?', help='the config file')

    ''' PORTS '''
    ap.add_argument('-zs', '--surrogate_port', metavar="SURROGATE_PORT", default="tcp://*:30114",
                    type=str, nargs="?", help="The zmq port")
    ap.add_argument('-zw', '--warning_port', metavar="WARNING_PORT", default="tcp://*:30115",
                    type=str,nargs="?", help="The zmq port")

    '''DOMAINS'''
    ap.add_argument('--surrogate_domain', metavar="SURROGATE_DOMAIN", default="t_surrogatedata",
                    type=str, nargs="?", help="The SimpleDB domain for storing surrogate data")
    ap.add_argument('--warning_domain', metavar="WARNING_DOMAIN", default="t_warningmessage",
                    type=str, nargs="?", help="The SimpleDB domain for storing warning data")

    ''' Setting up time parameters to allow for running model in the past '''
    utc_dt = T_UTC.localize(datetime.utcnow())
    eas_dt = utc_dt.astimezone(T_EASTERN)
    default_day = datetime.strftime(eas_dt + timedelta(days=1), "%Y-%m-%d")
    ap.add_argument('--predict_date', metavar="PREDICT_DATE", type=str,
                    default=default_day, nargs="?", help="The day to be predicted")
    ap.add_argument('--stock_list', metavar="Stock List", type=str,
                    nargs="+", help="The list of stock to be predicted")
    ap.add_argument('--rege_date', metavar="REGE_DATE", type=str,
                    help="The date need to be regerated")

    return ap.parse_args()
Example #8
def main():
    # Initialize arguments
    argparser = args.get_parser()
    argparser.add_argument('--json_file',
                           help='JSON file to publish',
                           required=True)
    arg = argparser.parse_args()

    queue.init(arg)
    writer = queue.open(arg.pub,
                        'pub',
                        ssh_key=arg.ssh_key,
                        ssh_conn=arg.tunnel)

    try:
        msg_reader = codecs.open(arg.json_file, encoding='utf-8', mode='r')
        message = msg_reader.readline()
        while message:
            writer.write(json.loads(message))
            message = msg_reader.readline()

        msg_reader.close()
    except KeyboardInterrupt:
        pass

    return 0
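# The reader above consumes one JSON object per line. A minimal sketch of
# producing such a file (file name and field names are placeholders, not part
# of the original example):

import codecs
import json

messages = [{"embersId": "abc123", "text": "hello"},
            {"embersId": "def456", "text": "world"}]

with codecs.open("messages.json", encoding="utf-8", mode="w") as out:
    for msg in messages:
        out.write(json.dumps(msg) + "\n")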
Example #9
def main():
    ap = args.get_parser()
    ap.add_argument("--dir", type=str, help="directory of company member")
    ap.add_argument("--o", type=str, help="the directory of output ")
    arg = ap.parse_args()

    assert arg.dir, 'Need a dir to explore'
    rules = {}
    print arg
    os.chdir(arg.dir)
    for f in glob.glob('*.csv'):
        stock = f.split(".")[0].split("_")[1]
        country = COUNTRY_MARKET[stock]
        rules[country] = []
        with open(f, 'r') as f_r:
            i = 0
            for line in f_r:
                i += 1
                if i >= 2:
                    l = line.strip().split(",")
                    company = l[2]
                    if company == "":
                        continue
                    tmp = company.split(" ")
                    if len(tmp) > 1:
                        tmp = tmp[0:len(tmp) - 1]

                    if company is not None:
                        rules[country].append(company.strip())

        rules[country].append(country)

    with open(arg.o, "w") as o_w:
        o_w.write(json.dumps(rules))
Example #10
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--inputFolder', type=str,
                    help='inputFolder containing twitter files',
                    default='/hdd/tweets/2012/may')
    ap.add_argument('-s', '--scoresFolder', type=str,
                    help='Folder containing scoreCards',
                    default='../data/scores/MX/')
    ap.add_argument('-cf', '--configFile', type=str,
                    help='election configuration file',
                    default='../configFiles/electionConfig_MX')
    ap.add_argument('-d1', '--fromDate', type=str,
                    help='fromDate')
    ap.add_argument('-d2', '--toDate', type=str,
                    help='toDate')
    ap.add_argument('-f1', '--flag1', help="countOrPredict",
                    type=str, default='2')
    ap.add_argument('-r', '--regression', help="regressionType",
                    type=str, default='LASSO')
    ap.add_argument('-f2', '--flag2', help="flag to push surrogates and warning to S3",
                    type=str, default='0')
    arg = ap.parse_args()
    logs.init(arg)

    try:
        elections = Elections(arg.inputFolder, arg.scoresFolder,
                              arg.configFile, arg.fromDate, arg.toDate)
        log.info("Election class initialized")
    except Exception as e:
        log.exception("exception during intialization: %s. Quitting!!", e)

    try:
        if (arg.flag1 == '1' or arg.flag1 == '3'):
            elections.collectMentions()
    except Exception as e:
        log.exception("error while tracking tweets")

    try:
        if (arg.flag1 == '2' or arg.flag1 == '3'):
            winner, winningScore, runnerUp, runnerUpScore, finalScore = elections.getWinner(arg.fromDate, arg.toDate, arg.regression)
            print "------------Regression Results-----------"
            print finalScore
            print winner + "====>" + str(winningScore)
            print "-----------------------------------------"
    except Exception as e:
        log.exception("error while calculating winner:%s", e)

    try:
        elections.createSurrogate(winner, winningScore, runnerUp, runnerUpScore, arg.flag2)
    except Exception as e:
        log.exception("error during creating warnings")

    try:
        if (arg.flag2 == '1'):
            elections.storeStatistics(arg.fromDate, arg.toDate)
    except Exception as e:
        log.exception("error in storing statistics:%s", e)

    log.info("ALL Operations Complete")
Example #11
def main():
	# Initialize arguments
	argparser = args.get_parser()
	argparser.add_argument('--local_port', help='Local port to connect to java server', required=True)
	arg = argparser.parse_args()
		
	localPort = int(arg.local_port)

	# Initialize log
	logs.init(arg)
	global log
	
	# Initialize the queue with arguments and connect to the specified feed
	log.info("Opening and connecting to queue %s", arg.sub)
	queue.init(arg)
	reader = queue.open(arg.sub, 'sub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)
	
	# Initialize the writer to publish to a queue
	log.info("Publishing to queue %s", arg.pub)
	writer = queue.open(arg.pub, 'pub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)
	

	count = 0
	# Connect to Java server
	while True:
		for feedmsg in reader:
			try:
				while True:
					try:
						sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
						sock.connect(("localhost", localPort))
						break
					except:
						log.info("Unable to connect to local server")

				log.debug("Connected to java server on port %d" % localPort)

				socketLines = sock.makefile()

				# Clean the message to fix irregularities
				feedmsg = message.clean(feedmsg)

				log.debug("Read message %d. Sending to java" % count)
				# Write message to socket stream
				sock.sendall(json.dumps(feedmsg))
				sock.sendall('\n')

				# Receive result from socket stream
				result = socketLines.readline()
				writer.write(json.dumps(result))
				count += 1

				sock.close()
			except KeyboardInterrupt:
				sys.exit(1)
			except Exception:
				log.info("Server was disconnected.")
Example #12
def parse_arg():
    ap = args.get_parser()
    ap.add_argument('--cat',
                    action="store_true",
                    help="Flag: input from stdin")
    ap.add_argument('--user', dest='user_file', type=str, help='User file')
    ap.add_argument('--folder', type=str, help="tweets folder")
    ap.add_argument('--month', type=str)
    arg = ap.parse_args()
    return arg
Example #13
def main():
    ap = args.get_parser()
    ap.add_argument("--da", help="the updated news file for analyze")
    ap.add_argument("--k", type=int, help="the number of topic")
    arg = ap.parse_args()

    assert arg.da, "Please input a news file"
    assert arg.k, "Please input the number of topics"

    compute_topic_daily(arg.da, arg.k)
Example #14
def main():
    '''
    Reads messages from the queue, retrieves the content
    from the source website and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument(
        '--cat',
        action="store_true",
        help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(
                            json.dumps(tweet,
                                       ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', (entry, ))

        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".',
                                          (tweet, ))

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
Example #15
def main():
    ap = args.get_parser()
    ap.add_argument('--df', help='The datafile to train')
    ap.add_argument('--od', help='the directory to store output')
    arg = ap.parse_args()

    lda = Lda()
    lda.doc_process(arg.df)
    lda.out_put_wordmap(arg.od)
    lda.estimate()
    lda.out_put_result(arg.od)
Example #16
def parse_arg():
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help="Flag: input from stdin")
    ap.add_argument('--user', dest='user_file',
                    type=str, help='User file')
    ap.add_argument('--folder', type=str,
                    help="tweets folder")
    ap.add_argument('--month', type=str)
    arg = ap.parse_args()
    return arg
Example #17
def main():
    '''
    Reads messages from the queue, retrieves the content
    from the source website and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    ap.add_argument('--region', metavar='REGION', type=str, default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)

        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"], filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub, 'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', tweet)

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
Example #18
def parse_args():
    T_UTC = pytz.utc
    T_EASTERN = pytz.timezone("US/Eastern")
    ap = args.get_parser()
#    ap.add_argument('-f',dest="bloomberg_price_file",metavar="STOCK PRICE",type=str,help='The stock price file')
    # read stdin, write stdout
    # ap.add_argument('-t', dest="trend_file", metavar="TREND RANGE FILE", default="./trendRange.json", type=str, nargs='?', help="The trend range file")
    utc_dt = T_UTC.localize(datetime.utcnow())
    eas_dt = utc_dt.astimezone(T_EASTERN)
    default_day = datetime.strftime(eas_dt, "%Y-%m-%d")
    ap.add_argument('-d', dest="operate_date", metavar="OPERATE DATE", type=str, default=default_day, nargs="?", help="The day to be processed")
    ap.add_argument('-sd', dest="start_date", metavar="START OPERATE DATE", type=str, nargs="?", help="The start date to be processed")
    ap.add_argument('-ed', dest="end_date", metavar="END OPERATE DATE", type=str, nargs="?", help="The end date to be processed")
    return ap.parse_args()
Example #19
def main():
    ap = args.get_parser()
    ap.add_argument('--replay', action="store_true", help="Test flag; if this argument is present, run as a test case")
    # if the rule file is not given as an argument, it is loaded from sys.stdin
    ap.add_argument('--rulefile', type=str, help="The rule file for duration analysis model")
    arg = ap.parse_args()

    if not arg.replay:
        assert arg.sub, 'Need a queue to subscribe to'
    assert arg.pub, 'Need a queue to publish to'

    logs.init(arg)
    queue.init(arg)
    test_flag = arg.replay
    if arg.rulefile:
        rule = eval(open(arg.rulefile).read())
    else:
        #load the rules from sys.stdin
        rule = eval(sys.stdin.read())

    conn = boto.connect_sdb()

    if not arg.replay:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    replayIO = StringIO.StringIO()
                    durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except EmbersException as e:
                    log.exception(e.value)
                except:
                    log.exception("Unexpected exception in process")
    else:
        # replay mode takes an enriched file as input
        enrich_messages = sys.stdin.readlines()
        for m in enrich_messages:
            m = json.loads(m.strip())
            try:
                replayIO = StringIO.StringIO()
                durationProcess(rule, conn, m, arg.pub, test_flag, replayIO)
            except KeyboardInterrupt:
                log.info('GOT SIGINT, exiting!')
                break
            except EmbersException as e:
                log.exception(e.value)
            except:
                log.exception("Unexpected exception in process")
Example #20
def main():
    ap = args.get_parser()
    ap.add_argument('--filedir', type=str, help="analysis files")
    ap.add_argument('--window', type=int, default=7)
    ap.add_argument('--result', type=str, help="result")
    arg = ap.parse_args()

    start_date = "2012-12-08"
    end_date = "2013-05-31"
    dates = date_seed(start_date, end_date)
    for d in dates:
        detector = Detector(d, arg.window, arg.filedir, arg.result)
        detector.load_files()
        detector.detect()
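# date_seed is defined elsewhere; an assumed, minimal sketch that returns every
# date string from start_date to end_date inclusive (the format is guessed from
# the literals above):

from datetime import datetime, timedelta

def date_seed(start_date, end_date, t_format="%Y-%m-%d"):
    start = datetime.strptime(start_date, t_format)
    end = datetime.strptime(end_date, t_format)
    return [datetime.strftime(start + timedelta(days=i), t_format)
            for i in range((end - start).days + 1)]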
Example #21
def main():
    ap = args.get_parser()
    ap.add_argument('--filedir', type=str, help="analysis files")
    ap.add_argument('--window', type=int, default=7)
    ap.add_argument('--result', type=str, help="result")
    arg = ap.parse_args()

    start_date = "2012-12-08"
    end_date = "2013-05-31"
    dates = date_seed(start_date, end_date)
    for d in dates:
        detector = Detector(d, arg.window, arg.filedir, arg.result)
        detector.load_files()
        detector.detect()
Example #22
def main():
    ap = args.get_parser()
    ap.add_argument('--o', type=str, help="the output dir to store news")
    arg = ap.parse_args()
    
    assert arg.o, 'Need a dir to store news'
    logs.init(arg)
    locale.setlocale(locale.LC_TIME, 'es_ES.utf-8')
    
    seen_it = shelve.open('elfinance_seen_it.db')
    
    cas = ['finanzas']
    for ca in cas:
        get_category_news(ca, seen_it, arg.o)
Example #23
def main():
    '''
    Reads messages from the queue, retrieves the content
    from the source website and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument('--cat', action="store_true",
                    help='Read input from standard in and write to standard out.')
    arg = ap.parse_args()
    logs.init(arg)
    geo_mena = GeoMena()
    geo_lac = Geo(geo_region=GEO_REGION.lac)
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = sys.stdout
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    geo_annotate(tweet, geo_mena, geo_lac)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False).encode("utf-8"))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', (entry,))

        else:
            queue.init(arg)
            with queue.open(arg.sub, 'r') as inq:
                with queue.open(arg.pub, 'w', capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = geo_annotate(tweet, geo_mena, geo_lac)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".', (tweet,))

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{}".format(str(e)))
        return 1
Example #24
def main():
    """
    Utility for querying warnings stored in Elasticsearch
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    print(query(max_results=30))
Example #25
def main():
    ap = args.get_parser()
    ap.add_argument('--f', type=str, help='the news file')

    arg = ap.parse_args()

    assert arg.f, 'Need a file to ingest'
    assert arg.pub, 'Need a queue to publish'

    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, 'w') as q_w, open(arg.f, 'r') as f_r:
        for line in f_r:
            news = json.loads(line)
            q_w.write(news)
Example #26
def main():

    ap = args.get_parser()
    ap.add_argument('--fd', help='the directory of files to be processed')
    ap.add_argument('--f', help='the file to be processed')
    ap.add_argument('--o', help='the output file name')

    arg = ap.parse_args()
    assert arg.o, 'Need an output file'

    with codecs.open(arg.o, encoding='ascii', mode="w") as out_f:
        if arg.fd:
            process(arg.fd, out_f)
        elif arg.f:
            transfer2ldaf(arg.f, out_f)
        else:
            pass
Example #27
def main():
    NET_TYPE = {"c": comprehend_network, "u": user2user_network,
                "t": content_based_network, "e": entity_network,
                "r": entity_corr_network}
    ap = args.get_parser()
    ap.add_argument('--out', type=str, help='graph output folder',
                    default='./')
    ap.add_argument('--inf', type=str, help='tweet input folder')
    ap.add_argument('--infiles', type=str, nargs='+',
                    help='list of files to be handled')
    ap.add_argument('--c', type=str, nargs='+',
                    help='list of country')
    ap.add_argument('--dirf', type=str,
                    help='The folder to be handled directly')
    ap.add_argument('--net', type=str,
                    help="type of network,each symbol represent each type:c")
    ap.add_argument('--thre', type=float, default=0.1,
                    help="threshold for corr")
    arg = ap.parse_args()

    assert arg.net, "Please input a network type"
    global threshold
    threshold = arg.thre

    if arg.c and len(arg.c) > 0:
        country_list = arg.c
    else:
        country_list = COUNTRY

    if arg.inf:
        for country in country_list:
            in_folder = os.path.join(arg.inf, country.replace(" ", ""))
            out_folder = os.path.join(arg.out, "graph")
            out_folder = os.path.join(out_folder, country.replace(" ", ""))
            for t in arg.net:
                net_type = NET_TYPE.get(t)
                handle_by_folder(in_folder, out_folder, country, net_type)
    elif arg.infiles:
        for f in arg.infiles:
            for t in arg.net:
                net_type = NET_TYPE.get(t)
                handle_by_file(arg.out, f, country_list[0], net_type)
    elif arg.dirf:
        for t in arg.net:
            net_type = NET_TYPE.get(t)
            handle_by_folder(arg.dirf, arg.out, country_list[0], net_type)
Example #28
def main():
    ap = args.get_parser()
    ap.add_argument('--db', help="the path of sqlite db")
    ap.add_argument('--ts', type=str, nargs='+', help="the list of tickers")
    ap.add_argument('--merge', type=str, nargs='+', help='merge files with same date')
    arg = ap.parse_args()

    ts = arg.ts
    db = arg.db
    m_list = arg.merge

    if m_list:
        merg_data(m_list)

    if ts and db:
        conn = lite.connect(arg.db)
        ts = arg.ts
        get_data(conn, ts)
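# get_data and merg_data are not shown. A rough, assumed sketch of get_data,
# under the guess that the sqlite db holds one (ticker, date, price) table;
# the table and column names are illustrative only:

def get_data(conn, tickers):
    cur = conn.cursor()
    for ticker in tickers:
        cur.execute("SELECT date, price FROM stock_prices "
                    "WHERE ticker = ? ORDER BY date", (ticker,))
        for date, price in cur.fetchall():
            print "%s,%s,%s" % (ticker, date, price)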
Example #29
def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to!'
    assert arg.out, 'Need a file to store warnings!'

    logs.init(arg)
    queue.init(arg)
    out_file = arg.out

    with queue.open(arg.sub, 'r') as q_r:
        for m in q_r:
            with open(out_file, "a") as out_w:
                if not check_ifexist(m):
                    out_w.write(json.dumps(m) + "\n")
                else:
                    print "Duplicated Warnings"
Example #30
def main():
    ap = args.get_parser()
    ap.add_argument('--out', help="the output file of warnings")
    arg = ap.parse_args()

    assert arg.sub, 'Need a queue to subscribe to!'
    assert arg.out, 'Need a file to store warnings!'

    logs.init(arg)
    queue.init(arg)
    out_file = arg.out

    with queue.open(arg.sub, 'r') as q_r:
        for m in q_r:
            with open(out_file, "a") as out_w:
                if not check_ifexist(m):
                    out_w.write(json.dumps(m) + "\n")
                else:
                    print "Duplicated Warnings"
Example #31
def main():
    """
    Utility to cache messages from all queues on the --hostname provided that have the 'cache: true' option set in embers.conf
    --hostname  : Cache all active queues on this host
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument(
        '--hostname',
        metavar='HOSTNAME',
        type=str,
        default=environ.get('HOSTNAME', None),
        help=
        "The hostname of the machine whose services' data you wish to cache")
    arg = arg_parser.parse_args()

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)
    conf.init(arg)

    assert arg.hostname, '--hostname must be provided'
    queues = conf.get_all_cached_queues(hostname=arg.hostname)
    pool = []

    for queue in queues:
        log.info('Spawning cache process for %s' % queue)
        p = multiprocessing.Process(name=queue,
                                    target=cache_queue,
                                    args=(queue, ))
        p.start()
        pool.append(p)

    try:
        for process in pool:
            process.join()
            log.warn('%s caching has stopped' % process.name)
    except KeyboardInterrupt:
        log.warn('Keyboard interrupt in main')
Example #32
def main():
    ap = args.get_parser()
    ap.add_argument('-i', '--input', default='sys.stdin', type=str,
                    help='Path to the input file. Default is sys.stdin')
    ap.add_argument('-o', '--out', default='sys.stdout', type=str,
                    help='Path to the output file. Default is sys.stdout')
    ap.add_argument('searchPhrase', default='config/phrases.txt', type=str,
                    help='Path to the phrase file if the "-f" flag is specified; '
                         'otherwise the input string itself is treated as the phrase.')
    ap.add_argument('-f', '--file', action='store_true', default=False,
                    help='If given, the searchPhrase argument is interpreted as a path to a file')
    global logger
    logger = logs.getLogger("%s-%s.log" % (__processor__, str(datetime.now())))
    arg = ap.parse_args()
    logs.init(arg)
    inputFile = None
    outFile = None
    phraseFile = None

    if arg.input == 'sys.stdin':
        reader = codecs.getreader('utf-8')(sys.stdin)
    else:
        inputFile = open(arg.input, "r")
        reader = codecs.getreader('utf-8')(inputFile)
    if arg.out == 'sys.stdout':
        writer = codecs.getwriter('utf-8')(sys.stdout)
    else:
        outFile = codecs.open(arg.out, "w", encoding="utf-8")
        writer = codecs.getwriter('utf-8')(outFile)
    if arg.file:
        phraseFile = codecs.open(arg.searchPhrase, encoding='utf-8')
        generatePhraseList(phraseFile.readlines())
    else:
        generatePhraseList([arg.searchPhrase])
    phraseSearch(reader, writer)
    #close all files
    if inputFile:
        inputFile.close()
    if outFile:
        outFile.close()
    if phraseFile:
        phraseFile.close()
Example #33
def main():
    svm_twitter = SVM_Twitter(0.1, 0.1, 'rbf')
    ap = args.get_parser()
    ap.add_argument("--pca_num", default=8, type=int)
    ap.add_argument("--net", type=str)
    ap.add_argument("--k", type=int)
    ap.add_argument("--inf", type=str, help="input folder")
    ap.add_argument("--o_surr", type=str, help="output surrogate file")
    arg = ap.parse_args()
    folder = {
        "t": "content",
        "c": "comprehend",
        "u": "user2user",
        "e": "entity"
    }

    assert arg.pub, "Please input a queue to publish surrogate"
    queue.init(arg)
    send_queue = queue.open(arg.pub, "w")
    surr_w = open(arg.o_surr, "w")
    for country in COUNTRY:
        train_file = os.path.join(
            arg.inf, "%s_train_%d" % (country.replace(" ", ""), arg.k))
        test_file = os.path.join(
            arg.inf, "%s_test_%d" % (country.replace(" ", ""), arg.k))
        svm_twitter.load_data(train_file, test_file)
        svm_twitter.normalize()
        #svm_twitter.normalize()
        #svm_twitter.pca(arg.pca_num)
        svm_twitter.fit()
        svm_twitter.predict()

        for day in svm_twitter.novel_days:
            surrogate = {"country": country, "date": day.strftime("%Y-%m-%d")}
            send_queue.write(surrogate)
            surr_w.write(json.dumps(surrogate) + "\n")

        print "prediction result: %s " % country
        print [day.strftime("%Y-%m-%d") for day in svm_twitter.novel_days]
    surr_w.flush()
    surr_w.close()
    send_queue.close()
Example #34
def main():
    """
    Utility to set up a mapping for an EMBERS queue in Elasticsearch
    -q | --queue     : Queue name to set up the mapping for. Settings are read from embers.conf
    --log_file  : Path to write the log file to
    --log_level : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q', '--queue', help='Queue name to map into Elasticsearch')
    arg = arg_parser.parse_args()

    assert arg.queue, '--queue must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    add_type(index_name=general.get_index_name(), type_name=arg.queue)
Example #35
def main():
    ap = args.get_parser()
    default_day = datetime.strftime(datetime.now(), "%Y-%m-%d")
    ap.add_argument("--d", type=str, default=default_day, help="The day to ingest, Format: dd/mm/yyyy")
    ap.add_argument("--domain", default="bloomberg_prices", help="The simpleDB table to store raw data")
    arg = ap.parse_args()

    assert arg.pub, "Need a queue to publish"
    logs.init(arg)
    queue.init(arg)

    with queue.open(arg.pub, "w") as out_q:
        for stock in STOCK_CON:
            if stock == "COLCAP":
                scrape_f = scrape_colcap_url
            if stock == "CHILE65":
                scrape_f = scrape_chile65_url
            msg = ingest_price(arg, stock, scrape_f)
            if msg is not None:
                out_q.write(msg)
                store(arg, msg)
Example #36
def main():
	# Initialize arguments
	argparser = args.get_parser()
	argparser.add_argument('--json_file', help='JSON file to publish', required=True)
	arg = argparser.parse_args()
	
	queue.init(arg)
	writer = queue.open(arg.pub, 'pub', ssh_key=arg.ssh_key, ssh_conn=arg.tunnel)
	
	try:
		msg_reader = codecs.open(arg.json_file, encoding='utf-8', mode='r')
		message = msg_reader.readline()
		while message:
			writer.write(json.loads(message))
			message = msg_reader.readline()
		
		msg_reader.close()
	except KeyboardInterrupt:
		pass
	
	return 0
Example #37
def main():
    ap = args.get_parser()
    ap.add_argument('-c', '--conf', metavar='CONF', type=str, nargs='?', 
                    default=os.path.join(os.path.dirname(__file__), 'bloomberg_news_ingest.conf'),
                    help='The location of the configuration file.')
    arg = ap.parse_args()
    assert arg.pub, "--pub required. Need a queue to publish on"

    logs.init(arg)
    conf = get_conf(arg.conf)
    seen_it = shelve.open("bloomberg_news_seen_it.db")
    
    try:
        with queue.open(arg.pub, 'w', capture=True) as outq:
            for (index, companies) in conf.items():
                for company in companies:
                    articles = get_stock_news(index, company, seen_it)
                    for a in articles:
                        outq.write(a)

    except KeyboardInterrupt:
        log.info('GOT SIGINT, exiting')
Example #38
def main():
    ap = args.get_parser()
    ap.add_argument('--dir')
    arg = ap.parse_args()

    assert arg.pub, "Enter a queue to pub"

    file_folder = arg.dir
    files = os.listdir(file_folder)
    w_queue = queue.open(arg.pub, "w", capture=True)

    for f in files:
        full_f = os.path.join(file_folder, f)
        with open(full_f) as af:
            for d_ana in af:
                temp = d_ana.strip().split("|")
                message = {"country": temp[1],
                           "date": temp[0],
                           "z_value": temp[2],
                           "diff_mag": temp[3]}
                w_queue.write(message)
    w_queue.close()
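# The field order parsed above implies input lines shaped like
# date|country|z_value|diff_mag; a tiny worked example (values made up):

sample = "2013-01-15|Argentina|2.7|0.42"
temp = sample.strip().split("|")
message = {"country": temp[1],
           "date": temp[0],
           "z_value": temp[2],
           "diff_mag": temp[3]}
# message == {'country': 'Argentina', 'date': '2013-01-15',
#             'z_value': '2.7', 'diff_mag': '0.42'}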
Example #39
def main():
    svm_twitter = SVM_Twitter(0.1, 0.1, 'rbf')
    ap = args.get_parser()
    ap.add_argument("--pca_num", default=8, type=int)
    ap.add_argument("--net", type=str)
    ap.add_argument("--k", type=int)
    ap.add_argument("--inf", type=str, help="input folder")
    ap.add_argument("--o_surr", type=str, help="output surrogate file")
    arg = ap.parse_args()
    folder = {"t": "content", "c": "comprehend", "u": "user2user",
              "e": "entity"}

    assert arg.pub, "Please input a queue to publish surrogate"
    queue.init(arg)
    send_queue = queue.open(arg.pub, "w")
    surr_w = open(arg.o_surr, "w")
    for country in COUNTRY:
        train_file = os.path.join(arg.inf,
                                  "%s_train_%d" % (country.replace(" ", ""), arg.k))
        test_file = os.path.join(arg.inf,
                                 "%s_test_%d" % (country.replace(" ", ""), arg.k))
        svm_twitter.load_data(train_file, test_file)
        svm_twitter.normalize()
        #svm_twitter.normalize()
        #svm_twitter.pca(arg.pca_num)
        svm_twitter.fit()
        svm_twitter.predict()

        for day in svm_twitter.novel_days:
            surrogate = {"country": country, "date": day.strftime("%Y-%m-%d")}
            send_queue.write(surrogate)
            surr_w.write(json.dumps(surrogate)+ "\n")

        print "prediction result: %s " % country
        print [day.strftime("%Y-%m-%d") for day in svm_twitter.novel_days]
    surr_w.flush()
    surr_w.close()
    send_queue.close()
Example #40
def main():
    ap = args.get_parser()
    ap.add_argument('--level', type=str, default="0.6",
                    help='The threshold')
    ap.add_argument('--svm', action='store_true')
    ap.add_argument('--zmq', action='store_true')
    ap.add_argument('--surr', type=str, help="surrogate file")
    ap.add_argument('--warn', type=str, help="warning file")
    arg = ap.parse_args()

    logs.init(arg)
    queue.init(arg)
    assert arg.pub, "Please input a queue to publish warning"
    if arg.zmq:
        assert arg.sub, "Please input a queue to sub surrogate message"
    conn = boto.connect_sdb()
    t_domain = get_domain(conn, "s_holiday")

    if arg.zmq:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    if arg.svm:
                        svm_warning(t_domain, m, arg.pub)
                    else:
                        warning_center(t_domain, m, arg.pub, float(arg.level))
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except:
                    log.exception("Exception in Process:%s" % sys.exc_info()[0])
    else:
        with open(arg.warn, "w") as w, open(arg.surr) as r:
            if arg.svm:
                for m in r:
                    m = json.loads(m)
                    warning = svm_warning(t_domain, m, arg.pub)
                    w.write(json.dumps(warning) + "\n")
Example #41
def main():
    ap = args.get_parser()
    ap.add_argument('--level', type=str, default="0.6", help='The threshold')
    ap.add_argument('--svm', action='store_true')
    ap.add_argument('--zmq', action='store_true')
    ap.add_argument('--surr', type=str, help="surrogate file")
    ap.add_argument('--warn', type=str, help="warning file")
    arg = ap.parse_args()

    logs.init(arg)
    queue.init(arg)
    assert arg.pub, "Please input a queue to publish warning"
    if arg.zmq:
        assert arg.sub, "Please input a queue to sub surrogate message"
    conn = boto.connect_sdb()
    t_domain = get_domain(conn, "s_holiday")

    if arg.zmq:
        with queue.open(arg.sub, 'r') as inq:
            for m in inq:
                try:
                    if arg.svm:
                        svm_warning(t_domain, m, arg.pub)
                    else:
                        warning_center(t_domain, m, arg.pub, float(arg.level))
                except KeyboardInterrupt:
                    log.info('GOT SIGINT, exiting!')
                    break
                except:
                    log.exception("Exception in Process:%s" %
                                  sys.exc_info()[0])
    else:
        with open(arg.warn, "w") as w, open(arg.surr) as r:
            if arg.svm:
                for m in r:
                    m = json.loads(m)
                    warning = svm_warning(t_domain, m, arg.pub)
                    w.write(json.dumps(warning) + "\n")
Example #42
def main():
    ap = args.get_parser()
    ap.add_argument("--out", type=str,
                    help="the output dir")
    ap.add_argument("--inf", type=str, help="graph files folder")
    ap.add_argument("--files", type=str,
                    nargs='+', help="file list")
    ap.add_argument("--net", type=str)
    ap.add_argument("--c_folder", type=str)
    arg = ap.parse_args()
    assert arg.out, "Please Enter a output dir"
    if arg.inf:
        folders = os.listdir(arg.inf)
        for folder in folders:
            full_f = os.path.join(arg.inf, folder)
            if not os.path.isdir(full_f):
                continue
            analysis_by_folder(full_f, arg.out, arg.net)
    elif arg.files:
        for f in arg.files:
            analysis_by_file(f, arg.out, arg.net)
    elif arg.c_folder:
        analysis_by_folder(arg.c_folder, arg.out, arg.net)
Example #43
def main():
    '''
    Reads messages from the queue, retrieves the content
    from the source website and publishes the content to a new queue.
    '''
    ap = args.get_parser()
    ap.add_argument(
        '--cat',
        action="store_true",
        help='Read input from standard in and write to standard out.')
    ap.add_argument('--region',
                    metavar='REGION',
                    type=str,
                    default=None,
                    help='Specify region to filter by')
    arg = ap.parse_args()
    logs.init(arg)
    filter_region = arg.region
    geoc = GeoCountry()
    try:
        if arg.cat:
            log.debug('Reading from stdin and writing to stdout.')
            ins = sys.stdin
            outs = codecs.getwriter('utf-8')(sys.stdout)
            for entry in ins:
                entry = entry.decode(encoding='utf-8')
                try:
                    tweet = json.loads(entry.strip())
                    tweet = annotate(tweet, geoc, filter_region)
                    if tweet is not None:
                        outs.write(json.dumps(tweet, ensure_ascii=False))
                        outs.write('\n')
                        outs.flush()
                except Exception:
                    log.exception('Failed to process message "%s".', entry)

        else:
            queue.init(arg)
            iqueue.init(arg)
            qname = "{}-geoCountry-{}".format(os.environ["CLUSTERNAME"],
                                              filter_region)
            with iqueue.open(arg.sub, 'r', qname=qname) as inq:
                with queue.open(arg.pub,
                                'w') as outq:  # , capture=True) as outq:
                    for tweet in inq:
                        try:
                            content = annotate(tweet, geoc, filter_region)
                            if content is not None:
                                outq.write(content)
                        except KeyboardInterrupt:
                            log.info("Got SIGINT, exiting.")
                            break
                        except Exception:
                            log.exception('Failed to process message "%s".',
                                          tweet)

        return 0

    except Exception as e:
        log.exception("Unknown error in main function-{0!s}.".format(e))
        return 1
Example #44
def main():
    """
    Utility to cache messages from a queue into Elasticsearch
    -q | --queue   : Read from <queue> and write the messages to Elasticsearch. Settings are read from embers.conf
    --log_file     : Path to write the log file to
    --log_level    : Logging level
    """
    from etool import args
    global log

    arg_parser = args.get_parser()
    arg_parser.add_argument('-q',
                            '--queue',
                            help='Queue name to index into Elasticsearch')
    arg_parser.add_argument(
        '-s',
        '--s3fromq',
        action='store_true',
        help='ingest from S3 prefix derived from queue name')
    arg_parser.add_argument('-p', '--prefix', help='Ingest from prefix')
    #arg_parser.add_argument('-t', '--typename', default='noqueue', help='Type for prefix ingest')
    arg_parser.add_argument('-t', '--typename', help='Type for prefix ingest')
    arg_parser.add_argument(
        '-l',
        '--tmpcopy',
        default='/home/embers/data/tmpcopy',
        help='Name of local copy of S3 file (same for all S3 files)')
    arg_parser.add_argument('-c',
                            '--chunk',
                            type=int,
                            default=100,
                            help='Chunk size for S3 ingest')
    arg_parser.add_argument('-i',
                            '--clustername',
                            help='Clustername to determine index name')
    arg_parser.add_argument(
        '-w',
        '--withbase',
        action="store_true",
        help="Add basename to prefix when looking for type.")
    arg_parser.add_argument('--startdate',
                            help='start date in format like 2015-01-02')
    arg_parser.add_argument('--enddate',
                            help='end date in format like 2015-01-02')
    arg = arg_parser.parse_args()

    #assert (arg.queue or (arg.prefix and arg.typename)), 'Either --queue (with optional --s3fromq/--typename) or --prefix with --typename must be provided'
    assert (
        arg.queue or arg.prefix
    ), 'Either --queue (with optional --s3fromq/--typename) or --prefix  must be provided'

    log = logs.getLogger(log_name=arg.log_file)
    logs.init(arg, l=arg.log_level, logfile=arg.log_file)

    index_name = general.get_index_name(arg.clustername)

    queue.init()

    if arg.prefix or (arg.queue and arg.s3fromq):
        if arg.prefix:
            prefix = arg.prefix
            # get queue name or its substitute for S3 objects from prefix
            if arg.typename:
                type_name = arg.typename
            else:
                type_name = queue.conf.get_prefixpair(
                    prefix=prefix, includeS3=True, withBasename=arg.withbase)
                if not type_name:
                    log.error("Could not get type from prefix %s" % prefix)
                    return 1
                log.warning("type_name=%s from prefix=%s" %
                            (type_name, prefix))
        else:
            type_name = arg.queue
            prefix, include = queue.conf.get_prefix_for_queue(
                type_name, withBasename=False)
            if not prefix:
                log.error("Could not get S3 prefix for queue %s" % type_name)
                return 1

        if not general.get_es_connection().indices.exists_type(
                index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        conn_s3 = boto.connect_s3(aws_access_key_id=arg.aws_key,
                                  aws_secret_access_key=arg.aws_secret)
        bucket = conn_s3.get_bucket(
            arg.bucket)  # connect to S3, get bucket ptr for arg.bucket
        attach_to_s3(index_name,
                     s3prefix=prefix,
                     bucket=bucket,
                     type_name=type_name,
                     tmpcopy=arg.tmpcopy,
                     chunk_size=arg.chunk,
                     startdate=arg.startdate,
                     enddate=arg.enddate)
    else:

        if arg.typename:
            type_name = arg.typename
        else:
            type_name = arg.queue

        if not general.get_es_connection().indices.exists_type(
                index=index_name, doc_type=type_name):
            # Create mapping if the queue has not been stored in Elasticsearch yet
            index_setup.add_type(index_name=index_name, type_name=type_name)

        attach_to_queue(index_name=index_name,
                        queue_name=arg.queue,
                        type_name=type_name)