def __init__(self, nmembers, npolicies, ribdump, rnd_policies=True,
             member_cap=None, path_templates=None):
    self.nmembers = nmembers
    self.npolicies = npolicies
    self.ribdump = ribdump
    self.rnd_policies = rnd_policies
    if path_templates:
        self.path_templates = path_templates
    else:
        self.path_templates = "templates/"
    if member_cap:
        self.member_cap = member_cap
    else:
        self.member_cap = self.nmembers
    self.update_template = util.load_json_file(self.path_templates + "update.json")
    self.sdx_template = util.load_json_file(self.path_templates + "sdx.json")
    self.route_set = self.parse_routes()
    self.members = self.gen_ixp_members()
    self.gen_members_policies()
def __init__(self, dir_path):
    '''
    Constructor

    Inputs:
        dir_path: (string) path to the directory that contains the file

    Initializes five public variables:
        name: name of dataset
        predictor_vars: list of all predictor variables
        dependent_var: dependent variable
        labels: labels of predictor variables and dependent variable
        data: a list with two elements, the first being the training data
            and the second being the testing data
    '''
    # Read CSV and JSON files
    data = util.load_numpy_array(dir_path, "data.csv")
    parameters = util.load_json_file(dir_path, "parameters.json")

    # Initialize attributes
    self.name = parameters["name"]
    self.predictor_vars = parameters["predictor_vars"]
    self.dependent_var = parameters["dependent_var"]
    self.labels = data[0]
    self.data = train_test_split(data[1],
                                 train_size=parameters["training_fraction"],
                                 test_size=None,
                                 random_state=parameters["seed"])
def setup_avatar_db(conn, adb):
    print("Creating the user avatar database")
    adb_url = '/' + adb
    conn.request("PUT", adb_url, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 201:
        print("User Avatar database created.")
    elif resp.getcode() == 409 or resp.getcode() == 412:
        print("Avatar database already exists.")
    else:
        print("Error creating avatar database.")
    # Now save the auth document
    auth_url = adb_url + '/_design/_auth'
    conn.request("GET", auth_url, headers=gh)
    resp = conn.getresponse()
    addoc = util.load_json_file(
        os.path.join(wf_dir, 'scripts/ddoc/avatar_auth.json'))
    addoc_old = util.decode_response(resp)
    if resp.getcode() == 200:
        print("Avatar auth doc already exists. Updating.")
        addoc['_rev'] = addoc_old['_rev']
    req_body = json.dumps(addoc)
    conn.request("PUT", auth_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Avatar auth doc saved successfully.")
    else:
        print("Avatar auth doc save failed.")
def setup_user_db(conn):
    # Now we'll set up the CouchDB user database
    print("\nSetting user database public fields in CouchDB")
    # First, set the user public fields in the CouchDB config
    url = '/_config/couch_httpd_auth/public_fields'
    field = "\"userPublic\""
    conn.request("PUT", url, body=field, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("User config updated successfully")
    else:
        print("User config update failed!")
    # Now, set up some views in the user database
    url = '/_users/_design/user_queries'
    # Get the user design doc, if it exists
    conn.request("GET", url, headers=gh)
    resp = conn.getresponse()
    old_ddoc = util.decode_response(resp)
    user_ddoc = util.load_json_file(
        os.path.join(wf_dir, "scripts/ddoc/user_ddoc.json"))
    if resp.getcode() != 404:
        user_ddoc['_rev'] = old_ddoc['_rev']
    req_body = json.dumps(user_ddoc)
    conn.request("PUT", url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("User design doc saved successfully.")
    else:
        print("User design doc save failed.")
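# Note: the CouchDB setup snippets above rely on a small local `util` module.
# A minimal sketch of what `load_json_file` and `decode_response` might look
# like under that assumption (hypothetical, not the project's actual helpers):
import json

def load_json_file(path):
    # Read and parse a JSON file from disk.
    with open(path, 'r') as f:
        return json.load(f)

def decode_response(resp):
    # Decode an http.client response body as JSON; return None when the body
    # is empty or is not valid JSON.
    body = resp.read()
    if not body:
        return None
    try:
        return json.loads(body.decode('utf-8'))
    except ValueError:
        return None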
def parse_json_file(_filename, _already):
    sellers_set = set()
    data = util.load_json_file(_filename)
    for good in data:
        seller = good['seller']
        print(seller)
        print(_filename)
        print(good)
        s = seller.split('/')
        if len(s) == 0:
            continue
        s = s[-1][2:]
        if not s:
            continue
        seller_id = int(s)
        print(seller_id)
        unique = False
        if seller_id not in sellers_set:
            unique = True
            if seller_id not in _already:
                sellers_set.add(seller_id)
            else:
                print(seller_id, "is already in file")
        # use vk api to load user information
        # only if this id is unique
    return sellers_set
def test_pull_request_created(app):
    headers = {
        'X-Request-UUID': 'afe23a8c-dde6-4cde-8eaa-3e50077849f4',
        'X-Event-Key': 'pullrequest:created',
        'X-Event-Time': 'Wed, 10 Jul 2019 20:23:28 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }
    event = load_json_file('./tests/fixtures/pullrequest-created.json')
    expected_response = load_json_file(
        './tests/responses/pullrequest-created.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
def test_pull_request_fulfilled(app):
    headers = {
        'X-Request-UUID': 'a607b1c4-be59-4a27-83e5-208de2fa7e81',
        'X-Event-Key': 'pullrequest:fulfilled',
        'X-Event-Time': 'Wed, 10 Jul 2019 21:41:04 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }
    event = load_json_file('./tests/fixtures/pullrequest-fulfilled.json')
    expected_response = load_json_file(
        './tests/responses/pullrequest-fulfilled.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
def test_pull_request_rejected(app):
    headers = {
        'X-Request-UUID': 'a391cc1e-c057-4168-b5fd-2e52b911d5fd',
        'X-Event-Key': 'pullrequest:rejected',
        'X-Event-Time': 'Wed, 10 Jul 2019 20:23:44 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }
    event = load_json_file('./tests/fixtures/pullrequest-rejected.json')
    expected_response = load_json_file(
        './tests/responses/pullrequest-rejected.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
def test_commit_status_updated_failed(app):
    headers = {
        'X-Request-UUID': '1450daa7-5036-4b25-b24d-13fe76363b25',
        'X-Event-Key': 'repo:commit_status_updated',
        'X-Event-Time': 'Thu, 11 Jul 2019 14:36:20 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }
    event = load_json_file(
        './tests/fixtures/commit-status-updated-failed.json')
    expected_response = load_json_file(
        './tests/responses/commit-status-updated-failed.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
def test_commit_status_updated_successful(app):
    headers = {
        'X-Request-UUID': '01e7f365-6430-4a79-bd5a-976acc8e228e',
        'X-Event-Key': 'repo:commit_status_updated',
        'X-Event-Time': 'Thu, 11 Jul 2019 15:01:11 GMT',
        'X-Attempt-Number': '1',
        'X-Hook-UUID': 'fee8a257-2939-4b3a-aa64-2e07be1a8fb8',
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json'
    }
    event = load_json_file(
        './tests/fixtures/commit-status-updated-successful.json')
    expected_response = load_json_file(
        './tests/responses/commit-status-updated-successful.json')
    res = handle_bitbucket_event(event, headers)
    response = json.loads(res)
    assert expected_response == response
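# Note: the five webhook tests above share one structure. A hedged sketch of
# how they could be collapsed with pytest.mark.parametrize; the event keys and
# fixture names come from the tests themselves, but this consolidation is
# illustrative only and still depends on the project's `app` fixture,
# `load_json_file`, and `handle_bitbucket_event`.
import json
import pytest

BITBUCKET_CASES = [
    ('pullrequest:created', 'pullrequest-created'),
    ('pullrequest:fulfilled', 'pullrequest-fulfilled'),
    ('pullrequest:rejected', 'pullrequest-rejected'),
    ('repo:commit_status_updated', 'commit-status-updated-failed'),
    ('repo:commit_status_updated', 'commit-status-updated-successful'),
]

@pytest.mark.parametrize('event_key,name', BITBUCKET_CASES)
def test_bitbucket_event_parametrized(app, event_key, name):
    headers = {
        'X-Event-Key': event_key,
        'User-Agent': 'Bitbucket-Webhooks/2.0',
        'Content-Type': 'application/json',
    }
    event = load_json_file('./tests/fixtures/{}.json'.format(name))
    expected_response = load_json_file('./tests/responses/{}.json'.format(name))
    response = json.loads(handle_bitbucket_event(event, headers))
    assert expected_response == response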
def __init__(self, dir_path):
    '''
    Constructor

    Inputs:
        dir_path: (string) path to the directory that contains the file
    '''
    labels, self.csv = util.load_numpy_array(dir_path, "data.csv")
    json_full = util.load_json_file(dir_path, "parameters.json")
    self.name = json_full['name']
    self.predictor_vars = json_full['predictor_vars']
    self.dependent_var = json_full['dependent_var']
    self.training_fraction = json_full['training_fraction']
    self.seed = json_full['seed']
def __init__(self, dir_path):
    '''
    Constructor

    Inputs:
        dir_path: (string) path to the directory that contains the file
    '''
    self.data = util.load_numpy_array(dir_path, 'data.csv')
    self.parameters = util.load_json_file(dir_path, 'parameters.json')
    self.training_data, self.testing_data = train_test_split(
        self.data[1],
        test_size=(1 - self.parameters['training_fraction']),
        random_state=self.parameters['seed'])
def __init__(self, dir_path):
    '''
    Constructor

    Inputs:
        dir_path: (string) path to the directory that contains the file
    '''
    self.dir_path = dir_path
    params_dict = util.load_json_file(self.dir_path, 'parameters.json')
    self.label, data = util.load_numpy_array(self.dir_path, 'data.csv')
    self.pred_vars = params_dict['predictor_vars']
    self.dep_var = params_dict['dependent_var']
    self.X_train, self.X_test, self.y_train, self.y_test = \
        train_test_split(data[:, self.pred_vars], data[:, self.dep_var],
                         train_size=params_dict['training_fraction'],
                         random_state=params_dict['seed'])
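# Note: the constructors above all split their data with scikit-learn's
# train_test_split. A self-contained toy example of that call pattern (the
# data values here are made up); a fixed random_state plus an explicit
# train_size makes the split reproducible, which is why the constructors read
# a seed and a training fraction from parameters.json.
import numpy as np
from sklearn.model_selection import train_test_split

X = np.arange(20).reshape(10, 2)   # stand-in for the predictor columns
y = np.arange(10)                  # stand-in for the dependent column

X_train, X_test, y_train, y_test = train_test_split(
    X, y, train_size=0.8, random_state=42)
print(X_train.shape, X_test.shape)  # (8, 2) (2, 2)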
def prepare(study, portfolio, remote):
    studyParams = util.load_json_file("study/%s.json" % study)
    search = build_search(study, portfolio, studyParams)
    logging.info("Caching %s-%s/search" % (study, portfolio))
    cache.put("batch/%s-%s/search" % (study, portfolio), search, remote)
    batch_ = search['batch_']
    target_ = search['target_']
    value__ = search['value__']
    for batch, value_ in zip(batch_, value__):
        params = copy.deepcopy(studyParams)
        del params['shift']
        params['portfolioKey'] = "portfolio/%s" % portfolio
        apply_search(params, target_, value_)
        params['episodes'].update(epi.build_episodes(params['episodes']))
        logging.info("Caching %s" % batch)
        cache.put("batch/%s/params" % batch, params, remote)
def update_url_id(message_file_name, directory):
    message_id = message_file_name.split('.')[0]
    print(message_id)
    r = tg.call_method('getMessageLink', params={
        'chat_id': chat_id,
        'message_id': message_id
    })
    r.wait()
    if not r.update:
        return
    if 'url' not in r.update:
        return
    url_id = r.update['url'].split('/')[-1]
    print('https://t.me/cyclingmarket/{}'.format(url_id))
    full_path = os.path.join(directory, message_file_name)
    data = util.load_json_file(full_path)
    data['url_id'] = url_id
    util.save_json_file(full_path, data)
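# Note: several of these scripts also call `util.save_json_file` as the
# write-side counterpart of `util.load_json_file`. A minimal sketch under that
# assumption (hypothetical, not the actual helper):
import json

def save_json_file(path, data):
    # Serialize `data` to disk as UTF-8 JSON; ensure_ascii=False keeps
    # non-ASCII strings (such as the Russian city names used elsewhere in
    # these scripts) human-readable.
    with open(path, 'w', encoding='utf-8') as f:
        json.dump(data, f, ensure_ascii=False, indent=2)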
def http_put(ctl, args):
    if args['--payload']:
        payload = args['<payload>']
    elif args['--file']:
        payload = util.load_json_file(args['<file>'])
    status = OperStatus()
    headers = {'content-type': 'application/yang.data+json',
               'accept': 'text/json, text/html, application/xml, */*'}
    template_url = "http://{}:{}/restconf/{}"
    url = template_url.format(ctl.ipAddr, ctl.portNum, args['<resource>'])
    resp = ctl.http_put_request(url, json.dumps(payload), headers)
    if resp is None:
        status.set_status(STATUS.CONN_ERROR)
    elif resp.content is None:
        status.set_status(STATUS.CTRL_INTERNAL_ERROR)
    elif resp.status_code == 200 or resp.status_code == 204:
        status.set_status(STATUS.OK)
    else:
        status.set_status(STATUS.HTTP_ERROR, resp)
    return Result(status, resp)
print(seller_id, "is already in file") # use vk api to load user information # only if this id is unique return sellers_set if __name__ == "__main__": if len(sys.argv) < 2: print("give me directory with .json files with messages from channel") exit(-1) already_got_sellers_set = set() if len(sys.argv) == 3: sellers_json_input = sys.argv[2] sellers = util.load_json_file(sellers_json_input) if sellers: for s in sellers: already_got_sellers_set.add(s['id']) dir_name = sys.argv[1] json_files = glob.glob(os.path.join(dir_name, "messages*.json")) print(json_files) sellers_id_set = set() # f = dir_name + "messages.json" for f in json_files: sellers_id_set.update(parse_json_file(f, already_got_sellers_set)) # print(sellers_id_set)
SPB = [
    'с-пб', 'питер', 'санкт-петербург(доставказавашсчёт)',
    'спбплощадьвосстания', 'санк-петербург',
    'санкт-петербургдоставкакомпаниейсдэкзавашсчетпороссиииснг',
    'санктпетербург', 'спб', 'spb', 'петербург', 'санкт-петербург',
    'веледетвспбсдэком(пересылвмскзавашсчет)ценазастоковуюкомплектациюгайз',
    'санкт-петербург+почта'
]
MOSCOW = ['москва-красногорск', 'москва', 'мск']

if __name__ == "__main__":
    processed = os.listdir(PROCESSED_DIR)
    cities = set()
    for post_id in processed:
        filename = os.path.join(PROCESSED_DIR, post_id, 'data.json')
        json = util.load_json_file(filename)
        if isinstance(json['city'], dict):
            continue
        city_clear = clear_city_string(json['city'])
        cities.add(city_clear)
        if city_clear in SPB:
            json['city'] = {'id': 2, 'text': json['city']}
        if city_clear in MOSCOW:
            json['city'] = {'id': 1, 'text': json['city']}
        if city_clear == 'ростов-на-дону':
            json['city'] = {'id': 119, 'text': json['city']}
        if city_clear == 'великийновгород':
            json['city'] = {'id': 35, 'text': json['city']}
        if city_clear == 'вологда':
            json['city'] = {'id': 41, 'text': json['city']}
def setup_main_db(conn, main_db):
    write_role = main_db + ":write"
    # This here is the validation function to control writing to the main database
    validation_func = (
        """
        function(newDoc, oldDoc, userCtx){
            if((userCtx.roles.indexOf("%s") === -1) &&
               (userCtx.roles.indexOf("admin") === -1) &&
               (userCtx.roles.indexOf("master") === -1) &&
               (userCtx.roles.indexOf("_admin") === -1)){
                throw({forbidden: "Not authorized"});
            }
        }
        """
        % write_role
    )
    auth_doc = dict()
    auth_doc["_id"] = "_design/_auth"
    auth_doc["validate_doc_update"] = validation_func
    # Create the main database
    print("Creating the main Wikifeat database")
    main_db_url = "/" + main_db
    conn.request("PUT", main_db_url, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 201:
        print("Main database created.")
    elif resp.getcode() == 409 or resp.getcode() == 412:
        print("Main database already exists.")
    else:
        print("Error occurred.")
        sys.exit(-1)
    # Save the auth document
    auth_url = main_db_url + "/_design/_auth"
    conn.request("GET", auth_url, headers=gh)
    resp = conn.getresponse()
    addoc = util.decode_response(resp)
    req_body = ""
    if resp.getcode() == 404:
        req_body = json.dumps(auth_doc)
    elif resp.getcode() == 200:
        addoc["validate_doc_update"] = validation_func
        req_body = json.dumps(addoc)
    if len(req_body) > 1:
        conn.request("PUT", auth_url, body=req_body, headers=ph)
        resp = conn.getresponse()
        util.decode_response(resp)
        if resp.getcode() == 201:
            print("Main auth doc successfully updated.")
        else:
            print("Main auth doc update failed.")
    # Now load the main db security document
    sec_url = main_db_url + "/_security"
    main_sec = util.load_json_file(os.path.join(wf_dir, "scripts/ddoc/main_access.json"))
    req_body = json.dumps(main_sec)
    conn.request("PUT", sec_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Main security doc saved successfully.")
    else:
        print("Main security doc save failed.")
    # Now save the main db design doc
    main_ddoc_url = main_db_url + "/_design/wiki_query"
    conn.request("GET", main_ddoc_url, headers=gh)
    resp = conn.getresponse()
    existing_ddoc = util.decode_response(resp)
    main_ddoc = util.load_json_file(os.path.join(wf_dir, "scripts/ddoc/main_ddoc.json"))
    if resp.getcode() == 200:
        # Set the rev so we can update
        print("Main design doc exists. Updating.")
        main_ddoc["_rev"] = existing_ddoc["_rev"]
    req_body = json.dumps(main_ddoc)
    conn.request("PUT", main_ddoc_url, body=req_body, headers=ph)
    resp = conn.getresponse()
    util.decode_response(resp)
    if resp.getcode() == 200 or resp.getcode() == 201:
        print("Main design doc saved successfully")
    else:
        print("Main design doc save failed")
import pandas
from util import load_json_file, get_data_with_code, load_csv, get_value_from_json
from nlp import preprocess, get_jaccard_sim, get_intersection
import spacy
import numpy as np

nlp = spacy.load('en')
df = load_csv("data/output.csv", "|")
# print(df)
industry_json = load_json_file("sasb_mm_industry.json")
threat_json = load_json_file("sasb_mm_threats.json")
sub_df, sec_data_list = get_data_with_code("sasb", df, "Internet Media & Services")
# print(sec_data_list[0])

text_data = []
for data in sec_data_list:
    tokens = preprocess(data)
    # print(tokens)
    text_data.append(tokens)

threat_desc = []
threat_name = []
for threat in threat_json:
    # print(threat["Threat"])
    for obj in threat["SubThreats"]:
        # doc2 = nlp(obj["Description"])
        # print(obj["SubThreat"])
def main(args):
    try:
        logging.info('(function {}) Started'.format(main.__name__))
        source_files = UTIL.parse_source_files(args.data_path, args.from_files, logging)
        source_file = source_files['source']
        destination_file = os.path.join(args.data_path,
                                        args.from_format.lower() + '_to_' +
                                        args.to_format.lower() + '_' + args.to_file_name)
        # TODO: 1) Create an interface class so all formatters in the ds_formatter folder share the same signature.
        # TODO: 2) Create a generic approach to convert any type to any type, not only any type to squad.
        # TODO: 3) Find a better approach to handle the following if/else scenarios.
        # TODO: 4) Add some kind of field wrapper to control which fields get filled with dummy values and which with real values.
        if args.from_format.lower() == 'qangaroo' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/qangaroo_v1.1/wikihop"
            --from_files="source:dev.json"
            --from_format="qangaroo"
            --to_format="squad"
            --to_file_name="dev.json"  # it is gonna be renamed as "[from_to]_filename.what"
            """
            in_content = UTIL.load_json_file(source_file, logging)
            formatted_content = qangaroo.convert_to_squad(in_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'mctest' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/"
            --from_files="source:mc160.dev.tsv"
            --from_format="mctest"
            --to_format="squad"
            --to_file_name="mc160.dev.json"  # it is gonna be renamed as "[from_to]_filename.what"
            """
            story_question_content = UTIL.load_csv_file(source_file, "\t", None, logging)
            # answer_content = UTIL.load_csv_file(additional_files['answer'], "\t", None, logging)
            formatted_content = mctest.convert_to_squad(story_question_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'insuranceqa' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/insuranceqa_v2"
            --from_files="source:InsuranceQA.question.anslabel.token.1500.pool.solr.test.encoded,voc:vocabulary.txt,answer:InsuranceQA.label2answer.token.encoded"
            --from_format="insuranceqa"
            --to_format="squad"
            --to_file_name="1500.test.json"
            """
            voc = insuranceqa.load_vocab(source_files['voc'])
            questions, a_to_q_map = insuranceqa.load_questions(source_file, voc)
            answers = insuranceqa.load_answers(source_files['answer'], voc)
            formatted_content = insuranceqa.convert_to_squad(questions, answers, a_to_q_map)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'triviaqa' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/triviaqa/"
            --from_files="source:qa/wikipedia-train.json,wikipedia:evidence/wikipedia,web:evidence/web,seed:10,token_size:2000,sample_size:1000000"
            --from_format="triviaqa"
            --to_format="squad"
            --to_file_name="wikipedia-train-long.json"
            """
            wiki = source_files['wikipedia']
            web = source_files['web']
            seed = source_files['seed']
            max_num_of_tokens = source_files['token_size']
            sample_size = source_files['sample_size']
            qa_file = UTIL.load_json_file(source_file, logging)
            formatted_content = triviaqa.convert_to_squad_format(qa_file, wiki, web, sample_size,
                                                                 seed, max_num_of_tokens)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'wikiqa' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/WikiQACorpus"
            --from_files="source:WikiQA-dev.tsv"
            --from_format="wikiqa"
            --to_format="squad"
            --to_file_name="dev.json"
            """
            story_question_content = UTIL.load_csv_file(source_file, "\t", 'infer', logging)
            formatted_content = wikiqa.convert_to_squad(story_question_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'matchzoo':
            """
            **sample.txt**: Each line is the raw query and raw document text of a document.
            The format is "label \t query \t document_txt".
            --log_path="~/log.log"
            --data_path="~/data/squad"
            --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100"
            --from_format="squad"
            --to_format="matchzoo"
            --to_file_name="dev.txt"
            """
            negative_samp_count = int(source_files['negative_sampling'])
            q_len = int(source_files['q_len'])
            content = UTIL.load_json_file(source_file, logging)
            generator = squad.yield_to_matchzoo(content, q_len, negative_samp_count)
            open(destination_file, "w").write('\n'.join(data for data in generator))
            # UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'lucene':
            """
            **sample.txt**: Each line is the raw query and raw document text of a document.
            The format is "label \t query \t document_txt".
            --log_path="~/log.log"
            --data_path="~/data/squad"
            --from_files="source:dev-v1.1.json,doc_type_verbose:2"
            --from_format="squad"
            --to_format="lucene"
            --to_file_name="dev.txt"
            """
            doc_type_verbose = int(source_files['doc_type_verbose'])
            content = UTIL.load_json_file(source_file, logging)
            squad.convert_to_lucene(content, doc_type_verbose, args.data_path)
        elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'short_squad':
            """
            **sample.txt**: Each line is the raw query and raw document text of a document.
            The format is "label \t query \t document_txt".
            --log_path="~/log.log"
            --data_path="~/data/squad"
            --from_files="source:dev-v1.1.json,q_len:1000,negative_sampling:100"
            --from_format="squad"
            --to_format="short_squad"
            --to_file_name="dev.json"
            """
            negative_samp_count = int(source_files['negative_sampling'])
            q_len = int(source_files['q_len'])
            content = UTIL.load_json_file(source_file, logging)
            formatted_content = squad.convert_to_short_squad(content, q_len, negative_samp_count)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'squad' and args.to_format.lower() == 'squad':
            """
            In order to run some analyses.
            --log_path="~/log.log"
            --data_path="~/data/squad"
            --from_files="source:dev-v1.1.json,is_histogram:True,document_type:1"  # 1 for questions, 2 for paragraphs, 3 for both
            --from_format="squad"
            --to_format="squad"
            --to_file_name="dev.json"
            """
            is_histogram = source_files['is_histogram']
            document_type = int(source_files['document_type'])
            his_bin = int(source_files['histogram_bin'])
            content = UTIL.load_json_file(source_file, logging)
            squad.print_statistics(content, is_histogram, his_bin, document_type)
        elif args.from_format.lower() == 'narrativeqa' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/narrativeqa"
            --from_files="source:summaries.csv,set:train,qaps:qaps.csv"
            --from_format="narrativeqa"
            --to_format="squad"
            --to_file_name="train.json"  # it is gonna be renamed as "[from_to]_filename.what"
            """
            story_summary_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            question_content = UTIL.load_csv_file(source_files['qaps'], ",", 'infer', logging)
            set_type = source_files['set']
            formatted_content = narrativeqa.convert_to_squad(story_summary_content, question_content, set_type)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'webqa' and args.to_format.lower() == 'squad':
            " ************************************************************ "
            " *********************** ON-HOLD *****************************"
            " ************************************************************ "
            """
            --log_path="~/log.log"
            --data_path="~/data/"
            --from_files="label:question.train.token_idx.label,voc:vocabulary,answer:answers.label.token_idx"
            --from_format="webqa"
            --to_format="squad"
            --to_file_name="filename.what"  # it is gonna be renamed as "[from_to]_filename.what"
            """
            story_summary_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            question_content = UTIL.load_csv_file(source_files['qaps'], ",", 'infer', logging)
            set_type = source_files['set']
            formatted_content = narrativeqa.convert_to_squad(story_summary_content, question_content, set_type)
            UTIL.dump_json_file(args.destination_file_path, formatted_content, logging)
        elif args.from_format.lower() == 'msmarco' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/msmarco"
            --from_format="msmarco"
            --to_format="squad"
            --to_file_name="dev_2.1.json"  # it is gonna be renamed as "[from_to]_filename.what"
            """
            input_dict = {}
            try:
                version = float(source_files['v'])
            except:
                version = 2.0
            input_dict['v'] = version
            if version <= 2.0:
                """
                for version <= 2.0
                --from_files="source:dev_2.1.json, v:2.0"
                """
                in_content = UTIL.load_json_file(source_file, logging)
                input_dict['story_question_content'] = in_content
                formatted_content = msmarco.convert_to_squad(in_content)
            else:
                """
                for version > 2.0
                --from_files="source:queries.train.csv,document:collection.tsv,mapping:qrels.train.csv,v:2.1,limit:-1"
                """
                queries = UTIL.load_csv_file(source_file, "\t", None, logging, ['id', 'content'])
                input_dict['queries'] = queries
                mappings = UTIL.load_csv_file(source_files['mapping'], "\t", None, logging,
                                              ['q_id', 'tmp1', 'p_id', 'tmp2'], [0, 1, 2, 3])
                input_dict['mappings'] = mappings
                documents = UTIL.load_csv_file(source_files['document'], "\t", None, logging, ['id', 'content'])
                input_dict['documents'] = documents
                input_dict['limit'] = int(source_files['limit'])
                formatted_content = msmarco.convert_to_squad(input_dict)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'quasar' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/quasar-t"
            --from_format="quasar-t"
            --to_format="squad"
            --from_files="source:train_questions.json,document:train_contexts.json,type:t,is_null_tags_filter,limit:-1"
            --to_file_name="train.json"
            """
            if source_files['type'].lower() == 't':
                # quasar-t
                queries = UTIL.load_json_line_file(source_file, logging)
                documents = UTIL.load_json_line_file(source_files['document'], logging)
                formatted_content = quasar.convert_to_squad(queries, documents,
                                                            source_files['is_null_tags_filter'],
                                                            int(source_files['limit']))
                UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'ubuntu' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/ubuntu"
            --from_files="source:valid.csv"
            --from_format="ubuntu"
            --to_format="squad"
            --to_file_name="valid.json"
            """
            story_question_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            formatted_content = ubuntudialogue.convert_to_squad(story_question_content)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        elif args.from_format.lower() == 'newsqa' and args.to_format.lower() == 'squad':
            """
            --log_path="~/log.log"
            --data_path="~/data/newsqa"
            --from_files="source:newsqa-data-v1.csv,story:cnn_stories/"
            --from_format="newsqa"
            --to_format="squad"
            --to_file_name="news.json"
            """
            story_question_content = UTIL.load_csv_file(source_file, ",", 'infer', logging)
            context_content_path = source_files['story']
            formatted_content = cnnnews.convert_to_squad(story_question_content, context_content_path)
            UTIL.dump_json_file(destination_file, formatted_content, logging)
        else:
            pass
        logging.info('(function {}) Finished'.format(main.__name__))
    except Exception as e:
        logging.error('(function {}) has an error: {}'.format(main.__name__, e))
        raise
if __name__ == "__main__": if len(sys.argv) < 2: print("give me directory with .json files with messages from channel") exit(-1) dir_name = sys.argv[1] messages = glob.glob(os.path.join(dir_name, "messages*.json")) regexp_hash = re.compile("hash") messages = [x for x in messages if not regexp_hash.search(x)] # messages = glob.glob(os.path.join(dir_name, "messages25.json")) for messages_json_filename in messages: print(messages_json_filename) goods = util.load_json_file(messages_json_filename) goods_count = len(goods) i = 0 for g in goods: if len(g['seller']) <= 17: continue photo_link_jpg = g['photo_link'] photo_hash = get_photo_hash(photo_link_jpg) g['hash'] = photo_hash print(i, "/", goods_count, " ", photo_hash) i += 1 json_filename = os.path.splitext( messages_json_filename)[0] + "_hash.json" util.save_json_file(json_filename, goods) os.remove(messages_json_filename)
def process_singles():
    singles = os.listdir(SINGLE_DIR)
    for fname in singles:
        fname = SINGLE_DIR + fname
        data = util.load_json_file(fname)
        if 'content' not in data:
            continue
        if 'caption' not in data['content']:
            continue
        content = data['content']
        caption = content['caption']
        text = caption['text']
        prod_caption_ent = None
        prod_price_ent = None
        prod_seller_ent = None
        prod_descr_ent = None
        hashtag_ents = []
        entities = caption['entities']
        for e in entities:
            entity_type = e['type']['@type']
            if entity_type == 'textEntityTypeHashtag':
                hashtag_ents.append(e)
            if entity_type == 'textEntityTypeBold':
                if not prod_caption_ent:
                    prod_caption_ent = e
                else:
                    prod_price_ent = e
            if entity_type == 'textEntityTypeItalic':
                prod_descr_ent = e
            if entity_type == 'textEntityTypeMentionName':
                prod_seller_ent = e
        if prod_caption_ent is None or prod_price_ent is None \
                or prod_seller_ent is None or prod_descr_ent is None:
            continue
        product_hashtags = []
        for h in hashtag_ents:
            product_hashtags.append(get_from_text(text, h))
        product_caption = get_from_text(text, prod_caption_ent)
        product_descr = get_from_text(text, prod_descr_ent)
        product_price = get_from_text(text, prod_price_ent)
        product_seller_name = get_from_text(text, prod_seller_ent)
        product_city = get_city_from_text(text, prod_price_ent, prod_seller_ent)
        product_seller_id = prod_seller_ent['type']['user_id']
        photo_file_id = content['photo']['sizes'][-1]['photo']['remote']['id']
        r = tg.call_method('getUser', params={'user_id': product_seller_id})
        r.wait()
        seller = r.update
        product = {
            'hashtags': product_hashtags,
            'caption': product_caption,
            'descr': product_descr,
            'price': product_price,
            'city': product_city,
            'seller': {
                'id': product_seller_id,
                'full_name': product_seller_name,
                'username': seller['username'],
                'first_name': seller['first_name'],
                'last_name': seller['last_name'],
                'profile_photo': seller.get('profile_photo', None),
            },
            'photo': photo_file_id,
            'date': data['date']
        }
        url_id = data['url_id']
        pr_dir = os.path.join(PROCESSED_DIR, url_id)
        create_dir(pr_dir)
        util.save_json_file(os.path.join(pr_dir, 'data.json'), product)
        print(product)
    connection.commit()


if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("give me directory with .json files (with hash) with messages from channel and channel name")
        exit(-1)
    dir_name = sys.argv[1]
    tg_channel = sys.argv[2]
    sellers_filename = os.path.join(dir_name, "sellers.json")
    sellers = util.load_json_file(sellers_filename)
    sellers_to_mysql(sellers)
    exit(0)
    messages = glob.glob(os.path.join(dir_name, "messages*hash.json"))
    for messages_json_filename in messages:
        print(messages_json_filename)
        goods = util.load_json_file(messages_json_filename)
        for g in goods:
            if len(g['seller']) <= 17:
                continue
            seller_id = int(g['seller'][17:])
            description = g['description']
print "run : train, validate, test and report" print "track : track progress of jobs (remote only)" print "review : review performance of jobs (remote only)" print "dump : display cache item(s)" print "export : copy cache item(s) to clipboard" print "clear : clear cache item(s)" print "quit : quit" print "? : display help" elif (action == "portfolio"): portfolio = util.get_str_input("portfolio (%s) : " % portfolio, portfolio) elif (action == "study"): study = util.get_str_input("study (%s) : " % study, study) elif (action == "batches"): batches = util.get_str_input("batches (%s) : " % batches, batches) elif (action == "create"): portfolioParams = util.load_json_file("portfolio/%s.json" % portfolio) aPortfolio = ptf.Portfolio(portfolioParams) print "caching %s" % portfolio cache.put('portfolio/%s' % portfolio, aPortfolio, remote) elif (action == "remote"): remote = not remote elif (action == "debug"): debug = not debug elif (action == "pvdebug"): pvdebug = not pvdebug print pvdebug logging.getLogger().setLevel(level = logging.DEBUG if pvdebug else logging.INFO) elif (action == "prepare"): batcher.prepare(study, portfolio, remote) elif (action == "train"): batch_ = batcher.interpret_batches(study, portfolio, batches, remote)