def upgrade_model_batch():
    # Regenerate and export both the realtime and the batch models, then
    # record the new model names in the algorithm config.
    enable_crossfeature = Config(ALGOCONFIG_PATH).get('enable-cross-feature')

    _model = generate_model(REALTIME_MODE, cross_features=enable_crossfeature)
    model_name = export_model(_model, REALTIME_MODE, enable_crossfeature)
    Config(ALGOCONFIG_PATH).set('model-name/realtime', model_name)

    _model = generate_model(BATCH_MODE, cross_features=enable_crossfeature)
    model_name = export_model(_model, BATCH_MODE, enable_crossfeature)
    Config(ALGOCONFIG_PATH).set('model-name/batch', model_name)
def __init__(self):
    # Connect to the NeCTAR research cloud through its EC2-compatible endpoint.
    access_key_id = Config(DEPLOY_CONFIG_PATH).get('nectar/access_key_id')
    access_key_secret = Config(DEPLOY_CONFIG_PATH).get('nectar/access_key_secret')
    region = RegionInfo(name='melbourne', endpoint='nova.rc.nectar.org.au')
    self.ec2_conn = boto.connect_ec2(aws_access_key_id=access_key_id,
                                     aws_secret_access_key=access_key_secret,
                                     is_secure=True,
                                     region=region,
                                     port=8773,
                                     path='/Services/Cloud',
                                     validate_certs=False)
def export_stats(stats, mode, enable_crossfeature):
    # Serialise the training statistics to JSON and record the filename in
    # the algorithm config under the key matching the mode.
    json_data = stats.to_json()
    stat_name = _generate_stat_name(mode, enable_crossfeature)
    filename = MODEL_FILE_BASE_PATH + "{}.json".format(stat_name)
    if mode == REALTIME_MODE:
        Config(ALGOCONFIG_PATH).set('model-name/train-stats/realtime',
                                    "{}.json".format(stat_name))
    else:
        Config(ALGOCONFIG_PATH).set('model-name/train-stats/batch',
                                    "{}.json".format(stat_name))
    with open(filename, "w") as json_file:
        json_file.write(json_data)
def add_instance(self):
    instance_type = Config(DEPLOY_CONFIG_PATH).get('nectar/instance_type')
    keypair_name = Config(DEPLOY_CONFIG_PATH).get('nectar/keypair_name')
    security_groups = Config(DEPLOY_CONFIG_PATH).get('nectar/security_groups')
    zone = Config(DEPLOY_CONFIG_PATH).get('nectar/zone')
    image_id = self._get_image_id(Config(DEPLOY_CONFIG_PATH).get('nectar/image_name'))
    reservation = self.ec2_conn.run_instances(image_id=image_id,
                                              key_name=keypair_name,
                                              instance_type=instance_type,
                                              security_groups=security_groups,
                                              placement=zone)
    instance = reservation.instances[0]
    instance_info = {"id": instance.id,
                     "ip": instance.private_ip_address,
                     "state": instance.state}
    logger.info('Successfully created instance: {}'.format(instance_info))
    return instance_info
def __init__(self, db_name):
    server_config = Config(CONFIG_PATH).get('couchdb')
    self.client = CouchDB(server_config['username'],
                          server_config['password'],
                          url=server_config['server_addr'],
                          connect=True,
                          auto_renew=True)
    self.select_db(db_name)
def uclassify_topics(text):
    try:
        key = Config(CONFIG_PATH).get('uclassify/apikey')
        url = 'https://api.uclassify.com/v1/uClassify/Topics/classify'
        data = {'texts': [text]}
        header = {'Authorization': 'Token {}'.format(key),
                  'Content-Type': 'application/json'}
        response = requests.post(url=url, data=json.dumps(data), headers=header)
        if response.status_code == 200:
            # json.loads is safer than ast.literal_eval for JSON: it handles
            # true/false/null, which are not valid Python literals.
            resp_data = json.loads(response.text)[0]['classification']
            return {x['className']: x['p'] for x in resp_data}
        else:
            # Assume the key has expired: generate a fresh one and retry.
            # Note this recurses indefinitely if key generation keeps failing.
            keygen = UclassifyKeyGenerator()
            new_key = keygen.get_key()
            keygen.close()
            Config(CONFIG_PATH).set('uclassify/apikey', new_key)
            return uclassify_topics(text)
    except Exception as ex:
        logger.error('Error when uClassifying text: {}'.format(ex))
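# A minimal usage sketch (assumed, not from the repo; the sample text is
# illustrative): on success the helper returns a {className: probability} map.
if __name__ == '__main__':
    topics = uclassify_topics('Training a model to match cross-platform accounts')
    print(topics)  # e.g. {'Computers': 0.81, 'Science': 0.12, ...}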
def add_volume(self, instance_id):
    zone = Config(DEPLOY_CONFIG_PATH).get('nectar/zone')
    volume = self.ec2_conn.create_volume(size=50, zone=zone)
    # Poll until the new volume leaves the 'creating' state.
    info = self.get_volume_info(volume.id)
    while info["status"] != "available":
        logger.info("Volume state: " + info["status"])
        time.sleep(3)
        info = self.get_volume_info(volume.id)
    successful = volume.attach(instance_id, "/dev/vdc")
    if successful:
        logger.info("volume: " + volume.id + " attached to /dev/vdc")
    else:
        logger.error("ERROR: volume attach failed.")
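# A minimal usage sketch (assumed; 'NectarManager' is a hypothetical name for
# the class that owns add_instance/add_volume): provision an instance, then
# attach a 50 GB volume to it.
if __name__ == '__main__':
    manager = NectarManager()  # hypothetical class name
    instance_info = manager.add_instance()
    manager.add_volume(instance_info['id'])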
class Mrisa:
    port = Config(CONFIG_PATH).get('mrisa/port')
    service = Config(CONFIG_PATH).get('mrisa/server-path')
    proc = None

    def start(self):
        try:
            self.proc = subprocess.Popen(
                ['python3', self.service, '--port', str(self.port)])
            time.sleep(3)
            logger.info('MRISA service started at port {}.'.format(self.port))
        except Exception as ex:
            logger.error('Error occurred when starting MRISA: {}'.format(ex))

    def get_image_info(self, image_url):
        # POST the image URL to the local MRISA reverse-image-search service.
        data = json.dumps({"image_url": image_url})
        url = 'http://localhost/search'
        storage = BytesIO()
        c = pycurl.Curl()
        c.setopt(c.URL, str(url))
        c.setopt(c.PORT, self.port)
        c.setopt(c.HTTPHEADER, ['Content-Type: application/json'])
        c.setopt(pycurl.POST, 1)
        c.setopt(pycurl.POSTFIELDS, data)
        c.setopt(c.WRITEFUNCTION, storage.write)
        c.perform()
        c.close()
        returned_json = storage.getvalue().decode('UTF-8')
        return json.loads(returned_json)

    def stop(self):
        # Escalate from terminate() to kill() to a hard `kill -9` so the
        # subprocess does not linger.
        logger.info('Terminating process pid: {}'.format(self.proc.pid))
        self.proc.terminate()
        self.proc.kill()
        time.sleep(1)
        os.system('kill -9 {}'.format(self.proc.pid))
        logger.info('MRISA service stopped.')
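# A minimal usage sketch (assumed; the image URL is illustrative): start the
# MRISA service, reverse-search one image, then shut the service down.
if __name__ == '__main__':
    mrisa = Mrisa()
    mrisa.start()
    try:
        result = mrisa.get_image_info('https://example.com/photo.jpg')
        logger.info('MRISA result: {}'.format(result))
    finally:
        mrisa.stop()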
def __vectorize(self, info1, info2, mode):
    # todo: handle modes.
    result = {}
    profile1, profile2 = info1['profile'], info2['profile']

    logger.info('Evaluating username...')
    result['username'] = singleword_similarity(profile1, profile2)

    logger.info('Evaluating profile image...')
    result['profileImage'] = profile_img_sim(profile1['image'], profile2['image'])

    logger.info('Evaluating self description text...')
    result['self_desc'] = self.semantic_sim.similarity(
        profile1.get('description', ''), profile2.get('description', ''))

    logger.info('Evaluating self description url...')
    result['desc_overlap_url_count'] = desc_overlap_url(
        {'platform': info1['platform'],
         'username': profile1['username'],
         'desc': profile1.get('description', '')},
        {'platform': info2['platform'],
         'username': profile2['username'],
         'desc': profile2.get('description', '')})

    posts1 = info1.get('posts_content', [])
    posts2 = info2.get('posts_content', [])
    # In realtime mode, skip the post-based features when neither side has posts.
    if mode == REALTIME_MODE and len(posts1) == 0 and len(posts2) == 0:
        return result

    logger.info('Evaluating writing style...')
    result['writing_style'] = writing_style_sim(posts1, posts2)

    logger.info('Evaluating post similarity...')
    max_post_enabled = bool(
        Config(ALGOCONFIG_PATH).get('max-post-similarity-enabled'))
    result['post_text'] = (self.max_post_sim(posts1, posts2)
                           if max_post_enabled
                           else self.overall_post_sim(posts1, posts2))

    logger.info('Evaluating uClassify topical similarity...')
    result['uclassify'] = uclassify_similarity(
        " ".join(_get_post_text(posts1)), " ".join(_get_post_text(posts2)))

    logger.info('Calculation finished.')
    return result
import ast
import random
import os

from constant import CONFIG_PATH
from similarity.Config import Config

CONFIG = Config(CONFIG_PATH)
PAIRING_FILE_PATH = CONFIG.get("sampler/pairing_file")
MODE = CONFIG.get("sampler/mode")
INSTA_FOLDER = CONFIG.get("sampler/instagram_folder")
TWITTER_FOLDER = CONFIG.get("sampler/twitter_folder")


class Sampler:
    def __init__(self):
        self.items = []
        if MODE == 'file':
            # Each line of the pairing file is a Python-literal dict with
            # 'instagram' and 'twitter' keys.
            with open(PAIRING_FILE_PATH, "r") as file:
                for line in file:
                    self.items.append(ast.literal_eval(line))
            # Build lookup tables in both directions.
            self.ins_to_twi_dict = {x['instagram']: x['twitter'] for x in self.items}
            self.twi_to_ins_dict = {x['twitter']: x['instagram'] for x in self.items}
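# A minimal usage sketch (assumed; 'alice_ig'/'alice_tw' are illustrative
# handles): in 'file' mode the two dicts give constant-time lookup of an
# account's counterpart on the other platform.
if __name__ == '__main__':
    sampler = Sampler()
    twitter_handle = sampler.ins_to_twi_dict.get('alice_ig')    # -> e.g. 'alice_tw'
    instagram_handle = sampler.twi_to_ins_dict.get('alice_tw')  # -> e.g. 'alice_ig'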
def __init__(self):
    # Point the Google client libraries at the service-account key file.
    keyfile = Config(CONFIG_PATH).get('google/keyfile_path')
    os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = keyfile
def __init__(self):
    subscription_key = Config(CONFIG_PATH).get('microsoft/subscription_key')
    self.headers = {'Ocp-Apim-Subscription-Key': subscription_key}
        time.sleep(5)

    def getResultTable(self):
        # Poll until the results table appears in the page.
        while True:
            try:
                target = self.browser.find_element_by_id(
                    "ctl00_ContentPlaceHolder1_tblChart")
                return target
            except Exception:
                time.sleep(5)
                continue

    def close(self):
        self.browser.quit()


tea_enabled = bool(Config(ALGOCONFIG_PATH).get('tea-enabled'))


def query_writing_style(text):
    if len(text) == 0:
        return {}
    # Strip characters outside the Basic Multilingual Plane.
    text = ''.join(c for c in text if c <= '\uFFFF')
    try:
        readability_metrics = dict(
            readability.getmeasures(text, lang='en')['readability grades'])
    except ValueError:
        readability_metrics = 0.5
        logger.warning('Text is empty; readability returns 0.5 as default.')
    if tea_enabled:
        # Truncate to the first 300 words before querying TEA.
        text = ' '.join(text.split(' ')[:300])
        tea = TeaUtils()
        tea_metrics = tea.getTextMetrics(text)
def __init__(self):
    self.config = Config(CONFIG_PATH)
    self.semantic_sim = TensorSimilarity()
import datetime

from constant import CONFIG_PATH
from similarity.Config import Config

INFO = 0
WARNING = 1
ERROR = 2

enable_color = Config(CONFIG_PATH).get('logger/color-enabled')
write_to_file = Config(CONFIG_PATH).get('logger/write-to-file')
log_file_path = Config(CONFIG_PATH).get('logger/logfile-path')


def info(message):
    __print_message(message, INFO)


def error(message):
    __print_message(message, ERROR)


def warning(message):
    __print_message(message, WARNING)


def __print_message(message, level):
    color = {INFO: '\033[0m', WARNING: '\033[93m ', ERROR: '\033[91m '}
    prefix = {INFO: '(INFO)', WARNING: '(WARNING)', ERROR: '(ERROR)'}
    time = str(datetime.datetime.now().strftime("[%Y-%m-%d %H:%M:%S]"))
    log_body = '{prefix} {time} {message}'.format(prefix=prefix[level],
                                                  time=time,
                                                  message=message)
    # Assumed completion of the truncated original: print the line (colored
    # when enabled) and optionally append the plain line to the log file.
    print((color[level] + log_body + '\033[0m') if enable_color else log_body)
    if write_to_file:
        with open(log_file_path, 'a') as logfile:
            logfile.write(log_body + '\n')
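# A minimal usage sketch (assumed): the three public helpers share one
# formatting path and differ only in prefix and terminal color.
if __name__ == '__main__':
    info('model export finished')
    warning('train-stats file missing, regenerating')
    error('could not connect to CouchDB')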
from automation.batch.ModelUpgrade import (import_model, import_stats,
                                           generate_dataset, norm,
                                           generate_feature_vectors)
from constant import (ALGOCONFIG_PATH, BATCH_MODE, REALTIME_MODE,
                      MODEL_FILE_BASE_PATH)
from similarity.Config import Config

mode_name = {BATCH_MODE: 'batch', REALTIME_MODE: 'realtime'}
enable_crossfeature = Config(ALGOCONFIG_PATH).get('enable-cross-feature')


class OverallSimilarityCalculator:
    def _calc(self, data, mode):
        model_name = Config(ALGOCONFIG_PATH).get(
            'model-name/{}'.format(mode_name[mode]))
        stat_path = MODEL_FILE_BASE_PATH + Config(ALGOCONFIG_PATH).get(
            'model-name/train-stats/{}'.format(mode_name[mode]))
        model = import_model(model_name)
        train_stats = import_stats(stat_path)
        # Placeholder label; it is dropped again before prediction.
        data['vector']['label'] = -1
        feature_vectors = generate_feature_vectors(
            items=[data], mode=mode, cross_features=enable_crossfeature)
        test_set = generate_dataset(data_list=feature_vectors,
                                    mode=mode,
                                    cross_features=enable_crossfeature)
        test_set.pop('label')
        normed_test_data = norm(test_set, train_stats)
        test_predictions = model.predict(normed_test_data).flatten()
        return _limit_range(test_predictions[0], lower=0, upper=1)

    def calc(self, data):
        # Use the batch model when the record has all properties; otherwise
        # fall back to the lighter realtime model.
        if has_full_property(data):
            return self._calc(data, BATCH_MODE)
        return self._calc(data, REALTIME_MODE)
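# A minimal usage sketch (assumed; the document shape is a guess based on the
# fields _calc touches, i.e. a 'vector' dict plus whatever has_full_property
# inspects): calc() picks the model and returns a score clamped to [0, 1].
if __name__ == '__main__':
    calculator = OverallSimilarityCalculator()
    document = {'vector': {}}  # illustrative skeleton, not a real record
    score = calculator.calc(document)
    print('overall similarity: {:.3f}'.format(score))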