def api_issue():
    log_request(request, 'api-issue')

    # prepare constants
    headers = {'Content-Type': 'application/json; charset=utf-8'}
    bad_payload = {
        'success': False,
        'num': None,
        'url': None,
    }
    bad_request_response = (dump_to_json(bad_payload), 400, headers)

    # get and validate user's data
    valid, why = is_issue_request_valid(request)
    if not valid:
        log_request(request, 'api-issue-errors', why)
        return bad_request_response
    user_data = request.get_json()
    user_data = escape_sel_context(user_data)

    # get template
    exp_keys = {'owner', 'repo', 'title', 'body', 'labels', 'assignees'}
    issue_tmpl = get_config('issue_template', exp_keys)

    # specify template
    as_is_keys = {'owner', 'repo', 'labels', 'assignees'}
    issue = {k: issue_tmpl[k] for k in as_is_keys}
    issue['title'] = issue_tmpl['title'].format_map(user_data)
    issue['body'] = ''
    line_prefix = ''
    for line in issue_tmpl['body']:
        if isinstance(line, dict):
            line_prefix = line['line_prefix']
        else:
            line = line.format_map(user_data)
            line = ('\n' + line_prefix).join(line.split('\n'))
            issue['body'] += line_prefix + line
    issue['body'] = issue['body'].rstrip('\n')

    # create issue
    try:
        num, url = gitHub.create_issue(**issue)
        payload = {
            'success': True,
            'num': num,
            'url': url,
        }
    except:
        why = 'The exception raised in api_issue:\n' + format_exc()
        app.logger.warning(why)
        log_request(request, 'api-issue-errors', why)
        return bad_request_response
    return (dump_to_json(payload), 200, headers)

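# Hedged illustration (not from the source): a hypothetical 'issue_template'
# body of the shape api_issue() iterates over.  String entries are formatted
# with the user's data and appear to carry their own trailing '\n'; a dict
# entry switches the prefix prepended to every following line.  All names and
# values below are made up.
example_user_data = {'package': 'requests', 'details': 'breaks\non py3'}
example_tmpl_body = [
    'Package: {package}\n',
    {'line_prefix': '> '},
    '{details}',
]
body = ''
line_prefix = ''
for line in example_tmpl_body:
    if isinstance(line, dict):
        line_prefix = line['line_prefix']
    else:
        line = line.format_map(example_user_data)
        line = ('\n' + line_prefix).join(line.split('\n'))
        body += line_prefix + line
print(body)
# Package: requests
# > breaks
# > on py3
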
def main():
    docs = {}
    batch = 1
    for fname in glob.glob('text/*/wiki*', recursive=True):
        print(fname)
        with open(fname) as f:
            in_doc = False
            cur_doc = {}
            cur_lines = []
            for line in f:
                if not in_doc:
                    if line.startswith('<doc id="'):
                        in_doc = True
                        doc_id, title = extract_title_id(line)
                        cur_doc['id'] = doc_id
                        cur_doc['title'] = clean_title(title)
                    continue
                if line.startswith('</doc>'):
                    doc_id = cur_doc['id']
                    del cur_doc['id']
                    text = ''.join(cur_lines)
                    cats = RE_CAT.findall(text)
                    cats = [c.split('|')[0].strip() for _, c, _ in cats]
                    if cats:
                        cur_doc['cats'] = cats
                    is_disam = any(disam in text for disam in DISAMS)
                    if is_disam:
                        cur_doc['dis'] = 1
                    docs[doc_id] = cur_doc
                    in_doc = False
                    cur_doc = {}
                    cur_lines = []
                else:
                    cur_lines.append(line)
        if len(docs) >= 100000:
            dump_to_json(docs, 'expanded/expanded_{}.json'.format(batch))
            docs = {}
            batch += 1
    if docs:
        dump_to_json(docs, 'expanded/expanded_{}.json'.format(batch))
        docs = {}

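# Hedged illustration (not from the source): main() above parses
# WikiExtractor-style output, where each article is framed by <doc> tags on
# their own lines, roughly like this:
EXAMPLE_WIKIEXTRACTOR_DOC = '''\
<doc id="12" url="https://en.wikipedia.org/wiki?curid=12" title="Anarchism">
Anarchism

Anarchism is a political philosophy ...
</doc>
'''
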
def process_file(filename, processor):
    all_projects = []
    workbook = spreadsheet.WorkBook(filename)
    for sheet in workbook.sheets():
        #yesno = raw_input("Process this sheet? ")
        #if yesno.lower() == "y":
        #if "(ONGOING)" in sheet.name or "(COMPLETE" in sheet.name:
        for label in ["ONGOING", "ON GOING", "ON-GOING", "ON-GO"]:
            if label in sheet.name.upper():
                projects = process_sheet(sheet, processor)
                all_projects.extend(projects)
                break
        #else:
        #    print sheet.name
    print utils.dump_to_json(all_projects)

def log_request(request, name, why=None):
    """ Log request with particular logger (to corresponding file) """
    logger = logging.getLogger(LOGGER_NAME_PREFIX + name)
    logger.info('=================')
    logger.info('Method: {method}'.format(method=request.method))
    # headers from WSGI environment are not sorted
    headers = dump_to_json(dict(request.headers))
    logger.info('---- Headers ----\n' + headers)
    if request.is_json:
        json = dump_to_json(request.get_json())
        logger.info('---- JSON ----\n' + json)
    else:
        data = request.get_data().decode()
        logger.info('---- Data ----\n' + data)
    if why:
        logger.info('---- Why bad ----\n' + why)

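# Hedged sketch (not from the source): log_request() assumes a logger named
# LOGGER_NAME_PREFIX + name already writes to its own file.  One possible way
# to wire that up; the prefix value, log directory, and format string here are
# guesses, not the application's actual configuration.
import logging
import os

LOGGER_NAME_PREFIX = 'request.'  # assumption: defined elsewhere in the real module

def setup_request_logger(name, log_dir='logs'):
    logger = logging.getLogger(LOGGER_NAME_PREFIX + name)
    logger.setLevel(logging.INFO)
    handler = logging.FileHandler(os.path.join(log_dir, name + '.log'))
    handler.setFormatter(logging.Formatter('%(asctime)s %(message)s'))
    logger.addHandler(handler)
    return logger
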
def main(data_dir):
    redirects = {}
    batch = 1
    for fname in glob.glob(data_dir + '/*/wiki*', recursive=False):
        print(fname)
        with open(fname) as f:
            in_doc = False
            cur_doc = {}
            cur_lines = []
            for line in f:
                if not in_doc:
                    if line.startswith('<doc id="'):
                        in_doc = True
                        doc_id, title = extract_title_id(line)
                        cur_doc['id'] = doc_id
                        cur_doc['title'] = clean_title(title)
                    continue
                if line.startswith('</doc>'):
                    doc_id = cur_doc['id']
                    del cur_doc['id']
                    for cur_line in cur_lines:
                        m = RE_REDIRECT.search(cur_line)
                        if m:
                            cur_doc['redirect'] = m.group(1)
                            break
                    if 'redirect' in cur_doc:
                        redirects[doc_id] = cur_doc
                    in_doc = False
                    cur_doc = {}
                    cur_lines = []
                else:
                    cur_lines.append(line)
        if len(redirects) >= 100000:
            dump_to_json(redirects, 'expanded/expanded_{}.json'.format(batch))
            redirects = {}
            batch += 1
    if redirects:
        dump_to_json(redirects, 'expanded/expanded_{}.json'.format(batch))
        redirects = {}

def edit(cls, uuid):
    data = connection.get('/project/%s/edit' % (uuid))
    if not data:
        details = Project.get(uuid, as_json=True)
        data = dump_to_json(details)
        connection.set('/project/%s/edit' % (uuid), data)
    else:
        details = json.loads(data)
    project = cls(details)
    project.edit = True
    return project

def save(self):
    uuid = self._uuid
    timestamp = str(uuid1())
    self._details['_uuid'] = uuid
    self._details['_timestamp'] = timestamp
    data = dump_to_json(self._details)
    if self.edit:
        connection.set('/project/%s/edit' % (uuid), data)
    else:
        connection.sadd('/project', uuid)
        connection.sadd('/project/%s' % (uuid), timestamp)
        connection.set('/project/%s/%s' % (uuid, timestamp), data)

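# Hedged note (inferred from edit()/save() above, assuming `connection` is a
# Redis-style client): the resulting key layout is roughly
#   /project                     set of project uuids
#   /project/<uuid>              set of save timestamps (uuid1 strings)
#   /project/<uuid>/<timestamp>  JSON snapshot written by a regular save()
#   /project/<uuid>/edit         JSON draft read/written while a project is being edited
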
def main(args):
    filename = args.input_file
    with open(filename, "rb") as f:
        html_string = f.read()
    filename = remove_path(filename)
    g = Goose()
    article = g.extract(raw_html=html_string)
    data = {}
    # original assigned an undefined `text`; Goose exposes the extracted body
    # as `article.cleaned_text`
    data["text"] = article.cleaned_text
    data["id"] = filename
    data = dump_to_json(data)
    return data

def main(args):
    filename = args.input_file
    if args.no_byte:
        with open(filename, "r") as f:
            html_string = f.read()
    else:
        with open(filename, "rb") as f:
            html_string = f.read()
    filename = remove_path(filename)
    extractor = Extractor(extractor='ArticleExtractor', html=html_string)
    extracted_text = extractor.getText()
    data = {}
    data["text"] = extracted_text
    data["id"] = filename
    data = dump_to_json(data)
    return data

        stdout = stdout.decode('utf-8').replace('\n', '\t').split('\t')
        return stdout[1]
    except CalledProcessError as e:
        print(e)


if __name__ == "__main__":
    jardir = "/emw_pipeline_nf/bin/DTC_Nextflow"
    args = get_args()
    with open("asd", "w") as f:
        f.write(args.data)
    data = load_from_json(args.data)
    filename = args.input_dir + "/" + data["id"]
    if args.no_byte:
        with open(filename, 'r') as fr:
            html_string = str(fr.read())
    else:
        with open(filename, 'rb') as fr:
            html_string = str(fr.read())
    compileCmd = 'javac -cp .:{0}/dct-finder-2015-01-22.jar:{0}/commons-lang3-3.8.1.jar:{0}/commons-cli-1.4.jar {0}/main.java -d .'.format(
        jardir)
    excuteCmd = 'java -cp .:{0}/dct-finder-2015-01-22.jar:{0}/commons-lang3-3.8.1.jar:{0}/commons-cli-1.4.jar DTC_Nextflow.main {1}'.format(
        jardir, html_string)
    compiled = compile_java(compileCmd)
    pd = execute_java(excuteCmd)
    data["publish_time"] = pd
    print(dump_to_json(data))

def retrieve_prices_by_compare(all_styles, compare_url, years, out_dir):
    '''
    - all_styles {make: models}
    - models [model1, model2, ...]
    - model {name: 'name', year: styles}
    - styles [style1, style2, ...]
    - style {'name', 'id', 'attr1', 'attr2', ...}
      (this function adds 'price' attr to it)
    '''
    if not os.path.exists(out_dir):
        os.makedirs(out_dir)
    for make in all_styles:
        models = all_styles[make]
        for model in models:
            model_name = model.get('name')
            if model_name is None:
                cprint("Empty model name in list for make {}".format(make), 'err')
                continue
            for year in years:
                styles = model.get(year)
                if styles is None:
                    continue
                for style in styles:
                    vehicle_id = style.get('id')
                    if vehicle_id is None:
                        if style.get('name') is None:
                            cprint("Empty vehicle id for {} {} {} UNKNOWN TYPE".format(
                                year, make, model_name), 'r')
                        else:
                            cprint("Empty vehicle id for {} {} {} style {}".format(
                                year, make, model_name, style['name']), 'r')
                        continue
                    url = '{}{}-{}-{}-{}/'.format(compare_url, year, make,
                                                  model_name, vehicle_id)
                    try:
                        page = urllib.urlopen(url)
                    except:
                        cprint("Failed to open url: {}".format(url), 'err')
                        continue
                    soup = BeautifulSoup(page, 'lxml')
                    if page_not_found(soup):
                        cprint("Compare page for {} {} {} {} does not exist".format(
                            year, make, model_name, vehicle_id), 'r')
                        continue
                    for td in soup.find_all('td', {'class': ''}):
                        spans = td.find_all('span')
                        if len(spans) == 2 and spans[0].text == 'KBB Suggested Retail':
                            style['price'] = spans[1].text
                            cprint("Suggested price {} for {} {} {} style {}".format(
                                style['price'], year, make, model_name,
                                style['name']), 'g')
        cprint("Saving data for make {}".format(make), 'hi')
        out_file = out_dir + make + '.json'
        dump_to_json(all_styles[make], out_file)

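# Hedged illustration (not from the source): a minimal `all_styles` value of
# the shape the docstring above describes, with made-up vehicles and ids.
# Year keys are shown as strings; whether the real scraper keys models by
# string or int years is an assumption.
example_all_styles = {
    'Honda': [                                  # make -> list of models
        {
            'name': 'Civic',                    # model name
            '2016': [                           # year -> list of styles
                {'name': 'LX Sedan', 'id': '401589321'},  # 'price' is added later
            ],
        },
    ],
}
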
def main(args):
    # Source 1 times
    # Source 2 newind
    # Source 3 ind
    # Source 4 thehin
    # Source 5 scm
    # Source 6 people
    data = load_from_json(args.data)
    filename = args.input_dir + "/" + data["id"]
    with open(filename, "rb") as g:
        html_string = g.read()
    text = data["text"].splitlines()
    stoplist1 = None
    stoplist2 = None
    stoplist3 = None
    stoplist4 = None
    if args.source == 1:
        text = deletesamesubstr(text)
        stoplist1 = [
            "RELATED", "From around the web", "More from The Times of India",
            "Recommended By Colombia", "more from times of india Cities",
            "You might also", "You might also like", "more from times of india",
            "All Comments ()+^ Back to Top", "more from times of india News",
            "more from times of india TV", "more from times of india Sports",
            "more from times of india Entertainment",
            "more from times of india Life & Style",
            "more from times of india Business"
        ]
        stoplist2 = ["FOLLOW US", "FOLLOW PHOTOS", "FOLLOW LIFE & STYLE"]
    elif args.source == 3:
        stoplist1 = [
            "Tags:", "ALSO READ", "Please read our before posting comments",
            "TERMS OF USE: The views expressed in comments published on indianexpress.com are those of the comment writer's alone. They do not represent the views or opinions of The Indian Express Group or its staff. Comments are automatically posted live; however, indianexpress.com reserves the right to take it down at any time. We also reserve the right not to publish comments that are abusive, obscene, inflammatory, derogatory or defamatory."
        ]
    elif args.source == 4:
        stoplist3 = [
            "ShareArticle", "Updated:", "MoreIn", "SpecialCorrespondent",
            "METRO PLUS", "EDUCATION PLUS", "PROPERTY PLUS", "CINEMA PLUS",
            "DISTRICT PLUS"
        ]
        stoplist4 = [
            "METRO PLUS", "EDUCATION PLUS", "PROPERTY PLUS", "CINEMA PLUS",
            "DISTRICT PLUS"
        ]
    elif args.source == 5:
        stoplist1 = ["Print Email", "Video"]
        stoplist2 = [
            "Viewed", "Associated Press", "Get updates direct to your inbox",
            "Opinion"
        ]
    elif args.source == 6:
        stoplist2 = [
            'Email | Print', '+', 'stumbleupon', 'More Pictures', 'Save Article',
            'Click the "PLAY" button and listen. Do you like the online audio service here?',
            'Good, I like it', 'Do you have anything to say?', 'Name'
        ]
    text = [line for line in text if not line.startswith("Source")]
    if text:
        text = deletecertainstr(text, stoplist1=stoplist1, stoplist2=stoplist2,
                                stoplist3=stoplist3)
    if text:
        text, data = addnewstime(text, html_string, data, args.source,
                                 stoplist=stoplist4)
        if args.source == 1:
            text = deletesamesubstr(text)
    if text:
        text = "".join([
            line.strip() + "\n" if line.strip() != "" else ""
            for line in text
        ])[:-1]
    data["text"] = text
    data = dump_to_json(data)
    return data

    parser.add_argument('--out_dir', help="output folder")
    args = parser.parse_args()
    return (args)


def request(id, text):
    r = requests.post(url="http://localhost:5000/queries",
                      data={'identifier': id, 'text': text},
                      json={"Content-Type": "application/json"})
    return json.loads(r.text)


if __name__ == "__main__":
    args = get_args()
    data = load_from_json(args.data)
    rtext = request(id=data["id"], text=data["text"])
    data["doc_label"] = int(rtext["output"])
    data["length"] = len(data["text"])
    if data["doc_label"] == 0:
        write_to_json(data, data["id"], extension="json", out_dir=args.out_dir)
    else:
        data["sentences"] = rtext["event_sentences"]
        print(dump_to_json(data, add_label=True))

def replace_old_testing_json(raw_orders, json_fname: str):
    '''deletes old json, exports raw orders to json file'''
    output_dir = get_output_dir(client_file=False)
    json_path = os.path.join(output_dir, json_fname)
    delete_file(json_path)
    dump_to_json(raw_orders, json_fname)

dataset = loader.load(dataset_name,
                      encoding='utf8',
                      batch_size=params.batch_size,
                      to_tensor=True,
                      to_cuda=params.cuda)
logger.info("- done.")

# add datasets parameters into params
params.update(datasets_params)

# create model, optimizer and so on.
model, optimizer, criterion, metrics = model_factory(params)

# restore model, optimizer
status = Serialization(checkpoint_dir=model_dir).restore(model=model,
                                                         checkpoint=checkpoint)
if not status:
    logger.error("Restore model from the checkpoint: {}, failed".format(checkpoint))

logger.info("Starting evaluate model on test dataset...")
metrics_result = evaluate(model, dataset, criterion, metrics)
logger.info("- done.")

logger.info("Save metrics results...")
metrics_file = os.path.join(model_dir, metrics_filename.format(checkpoint))
dump_to_json(metrics_result, metrics_file)
logger.info("- done.")