def get_movie_info(body):
    body_line = body.split('\n')
    it = iter(body_line)
    lists = []
    count = 1
    name, grade, review_num, quote = '', '', '', ''
    while True:
        try:
            line = next(it)
            if 'class="title"' in line:
                name = get_text(line)
                next(it)
                count += 1
            elif 'class="rating_num"' in line:
                grade = get_text(line)
                next(it)
                line = next(it)
                review_num = get_text(line)
                count += 2
            elif 'class="inq"' in line:
                quote = get_text(line)
                count += 1
        except StopIteration:  # exit the loop once the input is exhausted
            break
        if count % 5 == 0:
            item = [name, grade, review_num, quote]
            count = 1
            lists.append(item)
    return lists
def read_page(page):
    html = read_html(page.url)
    soup = Soup(html)
    text_top = get_text(soup.find('div', id='maegaki'))
    print(text_top.count('\n'))
    text_mid = get_text(soup.find('div', id='honbun'))
    text_bot = get_text(soup.find('div', id='atogaki'))
    texts = [text for text in (text_top, text_mid, text_bot) if text]
    story = '''
────────────────────────────────
'''.join(texts)
    text = '''────────────────────────────────
◆ {}
────────────────────────────────
{}'''.format(page.title, story)
    return text
def play_audio(self, item, title):
    print(f'Opened the video "{title}"')
    # the duration label can live under either of two layouts
    get_duration = lambda: utils.get_text(self.driver, item + '/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.TextView') or \
        utils.get_text(self.driver, item + '/android.view.ViewGroup/android.widget.LinearLayout[2]/android.widget.TextView')
    duration_text = get_duration()
    if not duration_text:
        utils.swipe_up(self.driver)
        duration_text = get_duration()
    duration_text = duration_text.split(':')
    duration = int(duration_text[0]) * 60 + int(duration_text[1])
    print(f'This video takes {duration} seconds to watch')
    utils.click(self.driver, item)
    while True:
        start = time.time()
        time.sleep(1)
        self.wait(duration)
        dots = 1
        replay = '/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.LinearLayout[1]/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.FrameLayout/android.widget.FrameLayout[3]/android.widget.FrameLayout[2]/android.widget.FrameLayout/android.widget.LinearLayout'
        while not utils.find_element(self.driver, replay):
            print('Still waiting' + dots * '.', end='\r', flush=True)
            dots = max((dots + 1) % 6, 1)
            time.sleep(1)
        print('The video has finished')
        self.audio_time -= time.time() - start
        if self.audio_counter == 1 and self.audio_time > 0:
            print(f'The video is over, but {int(self.audio_time)} seconds remain, so watching it once more')
            time.sleep(random.uniform(0.8, 1.8))
            if utils.click(self.driver, replay):
                self.wait(duration)
        else:
            break
    self.audio_counter -= 1
    self.audio.append(title)
    print(f'Done watching; {self.audio_counter} video(s) left')
    self.driver.back()
def run_inference(self, image):
    """
    Performs the inferencing and OCR
    :param image: An image (numpy array) to perform inferencing on
    :return: List of text detected; returns an empty list if no object is detected
    """
    input_shape = self.get_input_shape()
    preprocess_function = lambda img: quantize((img / 127.5) - 1, 128, 127)
    out_list = self.invoke(image, preprocess_function)
    # model outputs 3 tensors now, normalized to between 0 and 1
    score_map = dequantize(out_list[0], 128, 127)
    geo_loc_map = dequantize(out_list[1], 128, 127)
    geo_angle = dequantize(out_list[2], 128, 127)
    score_map = (score_map + 1) * 0.5
    geo_loc_map = (geo_loc_map + 1) * 256
    geo_angle = 0.7853981633974483 * geo_angle  # scale to +/- pi/4
    geo_map = np.concatenate((geo_loc_map, geo_angle), axis=3)
    boxes = text_detection(score_map=score_map, geo_map=geo_map)
    if boxes is not None:
        boxes = boxes[:, :8].reshape((-1, 4, 2))
        boxes[:, :, 0] /= input_shape[1] / image.shape[0]
        boxes[:, :, 1] /= input_shape[2] / image.shape[1]
        output_text = []
        for box in boxes:
            box = sort_poly(box.astype(np.int32))
            if np.linalg.norm(box[0] - box[1]) < 5 or np.linalg.norm(box[3] - box[0]) < 5:
                continue
            (x_max, x_min), (y_max, y_min) = xy_maxmin(box[:, 0], box[:, 1])
            if x_max > image.shape[0]:
                x_max = image.shape[0]
            if x_min < 0:
                x_min = 0
            if y_max > image.shape[1]:
                y_max = image.shape[1]
            if y_min < 0:
                y_min = 0
            cv2.polylines(image, [box.astype(np.int32).reshape(-1, 1, 2)], True, color=(255, 255, 0), thickness=2)
            sub_img = image[y_min:y_max, x_min:x_max]
            txt = get_text(sub_img)
            if txt != '':
                output_text.append(txt)  # reuse the OCR result instead of running OCR twice
        return output_text
    return []
def run(self):
    while True:
        try:
            self.connect()
            time.sleep(6)  # splash screen
            utils.swipe_left(self.driver, y_ratio=3 / 4)
            print('Starting to read the headlines')
            while self.article_counter:
                item = '/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.LinearLayout/android.widget.FrameLayout[2]/android.support.v4.view.ViewPager/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.LinearLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.ListView/android.widget.FrameLayout[1]/android.widget.LinearLayout'
                title = utils.get_text(self.driver, item + '/android.widget.TextView')
                if title and title not in self.articles:
                    self.read_article(item, title)
                else:
                    utils.swipe_up(self.driver, y_ratio=5 / 9)
            while not utils.click(self.driver, '//android.widget.FrameLayout[@content-desc="电视台"]/android.widget.RelativeLayout'):
                time.sleep(1)
            print('Starting to watch the videos')
            while self.audio_counter:
                item = '/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.RelativeLayout/android.widget.LinearLayout/android.widget.FrameLayout[2]/android.support.v4.view.ViewPager/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.view.ViewGroup/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.LinearLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.ListView/android.widget.FrameLayout[1]/android.widget.LinearLayout'
                title = utils.get_text(self.driver, item + '/android.widget.LinearLayout/android.widget.TextView') or \
                    utils.get_text(self.driver, item + '/android.view.ViewGroup/android.widget.TextView')
                if title and title not in self.audio:
                    self.play_audio(item, title)
                else:
                    utils.swipe_up(self.driver, y_ratio=5 / 9)
            # self.subscribe()
            print('All done~')
            break
        except KeyboardInterrupt:
            break
        except Exception as e:
            print('Exception:', e)
        finally:
            self.driver.quit()
def diff(self, archivo):
    lines = utils.get_text().splitlines()
    lineas = open(archivo).readlines()
    tablas1 = self.obtenerTablas(lines)
    tablas2 = self.obtenerTablas(lineas)
    print("Running the diff")
    print("Analyzing the current database")
    for key in tablas1.keys():
        print("\tAnalyzing table " + key)
        if not tablas2.get(key):
            print("\t\tNot present in the old DB")
        else:
            for campo in tablas1[key]:
                if campo not in tablas2[key]:
                    print("\t\tField {" + campo + "} is missing from the old DB")
    print("Analyzing the old database")
    for key in tablas2.keys():
        print("\tAnalyzing table " + key)
        if not tablas1.get(key):
            print("\t\tNot present in the new DB")
        else:
            for campo in tablas2[key]:
                if campo not in tablas1[key]:
                    print("\t\tField {" + campo + "} is missing from the new DB")
def play_video(self, item):
    title = utils.get_text(
        self.driver,
        '/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.view.ViewGroup/android.support.v7.widget.RecyclerView/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.TextView'
    )
    print(f'Opened the video "{title}"')
    duration = utils.get_text(
        self.driver,
        '/hierarchy/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.LinearLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.view.ViewGroup/android.support.v7.widget.RecyclerView/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout[2]/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.FrameLayout/android.widget.LinearLayout[2]/android.widget.TextView'
    )
    duration_text = duration.split(' / ')[1].split(':')
    duration = int(duration_text[0]) * 60 + int(duration_text[1])
    print(f'This video takes {duration} seconds to watch')
    utils.click(self.driver, item)
    self.wait(duration)
    self.audio_time -= duration
def run(self, edit):
    print("entering")
    window = sublime.active_window()
    view = window.active_view()
    text = utils.get_text()
    text = quitCurlyAlones(text)
    view.run_command("replace_all", {"text": text})
def on_query_completions(self, view, prefix, locations):
    if utils.get_language() != "java":
        return
    ultimo = utils.get_last_character()
    if ultimo == ".":
        window = sublime.active_window()
        view = window.active_view()
        word = utils.get_word(-1)
        variables = Java().get_variables()
        tipo = word
        static = True
        if variables.get(word):
            tipo = variables[word]
            static = False
        package = re.findall("import ([\w.]+\.%s);" % tipo, utils.get_text())
        if not package:
            posibleRuta = os.path.join(PATH_JSON, "java", "lang", tipo + ".json")
            if os.path.exists(posibleRuta):
                package = ["java.lang." + tipo]
        if package:
            package = package[0]
            clase = self.get_project_class(package)
            if clase:
                return utils.get_completion_list(clase["members"])
            ruta = package.replace(".", os.sep) + ".json"
            ruta = os.path.join(PATH_JSON, ruta)
            print("package resolved")
            objeto = utils.load_json(ruta)
            miembros = "clase" if static else "object"
            return utils.get_completion_list(objeto[miembros])
def _enumerate_workspaces(self):
    if not self.valid:
        sd_l.error("The service document didn't pass the SWORD2 validation steps ('MUST' statements in spec). The workspaces and collections will not be enumerated.")
        return
    if self.sd_uri:
        sd_l.info("Enumerating workspaces and collections from the service document for %s" % self.sd_uri)
    # Reset the internally cached set
    self.workspaces = []
    for workspace in self.service_dom.findall(NS['app'] % "workspace"):
        workspace_title = get_text(workspace, NS['atom'] % 'title')
        sd_l.debug("Found workspace '%s'" % workspace_title)
        collections = []
        for collection_element in workspace.findall(NS['app'] % 'collection'):
            # app:collection + sword extensions
            c = SDCollection()
            c.load_from_etree(collection_element)
            collections.append(c)
        self.workspaces.append((workspace_title, collections))  # Add tuple
def parsing_item(tag):
    category = ''
    the_time = ''
    for h4_tag in tag.findAll('h4'):
        for children_of_h4_tag in h4_tag.contents:
            the_time = get_text(children_of_h4_tag) or the_time
        for strong_tag in h4_tag.findAll('strong'):
            for strong_tag_content in strong_tag.contents:
                category = get_text(strong_tag_content) or category
    for image_tag in tag.findAll('img'):
        for children_of_image_tag in image_tag.contents:
            if not children_of_image_tag.name and children_of_image_tag.strip():
                yield category, the_time, children_of_image_tag.strip()
def test_epoch_end(self, outputs):
    """
    Called at the end of test to aggregate outputs, similar to `validation_epoch_end`.
    :param outputs: list of individual outputs of each test step
    """
    images = []
    captions = []
    for output in outputs:
        captions.append(output["captions"])
        images.append(output["images"])
    captions = torch.cat(captions, dim=0)
    images = torch.cat(images, dim=0)
    captions = get_text(captions.cpu().numpy(), self.tokenizer)
    self.logger.experiment.add_image(captions[0], images[0])
    # Write outputs to disk
    with open("outputs.txt", "w") as file:
        for caption in captions:
            file.write(caption)
            file.write("\n")
    # create output dict
    tqdm_dict = {}
    results = {}
    results["progress_bar"] = tqdm_dict
    results["log"] = tqdm_dict
    return results
def on_item(self, ch, method, header, body):
    """
    Fires when we receive a new item to decode.
    """
    # Lookup data in store, body should actually be an ObjectId
    item = self.userstream_store.find_one({"_id": ObjectId(body)})
    if utils.item_a_direct_message(item) or utils.item_a_mention(item):
        text, screen_name = (utils.get_text(item), utils.get_screen_name(item))
        print " [x] Received %r from %r" % (text, screen_name)
        # Any Spotify tracks?
        tracks = spotify.lookup_tracks(text)
        if len(tracks) > 0:
            # Save to playlist
            for track in tracks:
                id = self.playlist_store.save({'track': track, 'status': 'new', 'source': 'twitter', 'from': utils.get_sender(item)})
                # Send each track to the broadcaster's 'receive' queue, so it can be broadcast
                # to all connected clients
                print " [x] Sending %r to broadcaster" % (track['track']['name'],)
                self.amqp_primary_channel.basic_publish(exchange='',
                                                        routing_key=self.amqp_out_queue,
                                                        body=str(id),
                                                        properties=pika.BasicProperties(
                                                            delivery_mode=2,  # make message persistent
                                                        ))
    # Confirm delivery
    ch.basic_ack(delivery_tag=method.delivery_tag)
def run(self, edit):
    text = utils.get_text()
    # a single pattern with one (type, name) group pair, so every match unpacks cleanly
    atributos = re.findall(r"private\s+([\w]+)\s+([\w]+)\s*[;=]", text, flags=re.IGNORECASE)
    nombreClase = re.findall(r"public class ([\w]+)", text, flags=re.IGNORECASE)
    if not nombreClase or not atributos:
        return
    nombreClase = nombreClase[0]
    listAtributos = []
    strAtributos = ""
    strCabecera = ""
    strConstructor = """\tpublic %(nombreClase)s(%(cabeceraConstructor)s){
%(atributos)s
\t}\n\n"""
    for atributo in atributos:
        strCabecera += atributo[0] + " " + atributo[1] + ","
        strAtributos += "\t\tthis." + atributo[1] + "=" + atributo[1] + ";\n"
    if strCabecera:
        strCabecera = strCabecera[:-1]
    if strAtributos:
        strAtributos = strAtributos[:-1]
    dConstructor = {"atributos": strAtributos, "nombreClase": nombreClase, "cabeceraConstructor": strCabecera}
    strConstructorMaxivo = strConstructor % dConstructor
    window = sublime.active_window()
    view = window.active_view()
    view.insert(edit, view.line(view.sel()[0]).a, """\tpublic %(nombreClase)s(%(cabeceraConstructor)s){
\t\tthis.%(atributo)s=%(atributo)s;
\t}\n\n""" % {"nombreClase": nombreClase, "atributo": atributos[0][1], "cabeceraConstructor": atributos[0][0] + " " + atributos[0][1]})
    view.insert(edit, view.line(view.sel()[0]).a, strConstructorMaxivo)
def on_data(self, data):
    if not data.strip():
        return True
    print " [x] Got:", data
    # Decode JSON data
    item = json.loads(data)
    # Save data
    id = self.store.save(item)
    # Is this item a direct message?
    a_direct_message = utils.item_a_direct_message(item)
    # Is this item a mention?
    a_mention = utils.item_a_mention(item)
    # Continue processing further down the chain
    if a_direct_message or a_mention:
        print " [x] Received", utils.get_screen_name(item), ":", utils.get_text(item)
        self.channel.basic_publish(exchange='',
                                   routing_key=self.amqp_queue,
                                   body=str(id),
                                   properties=pika.BasicProperties(
                                       delivery_mode=2,  # make message persistent
                                   ))
    return True
def text_from_audio():
    fin = wave.open('audios/audio.wav')
    text = get_text(fin)
    if request.method == 'POST':
        text = request.form['transcript']
        return show_form(text)
    return render_template('transcript.html', message=text)
def run(self):
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.bind(("", 8888))
    s.listen(15)
    while True:
        con, addr = s.accept()
        con.sendall(bytes(utils.get_text(), 'UTF-8'))
        con.close()  # close the client connection, not the listening socket
def guardar(self, nombre):
    if not nombre:
        return
    ext = utils.get_fileext()
    if not ext:
        ext = utils.get_ext()
    nombre = nombre.replace(" ", "_")
    text = utils.get_text()
    text = text.replace("$", "\\$")
    utils.file_write(TEMPLATES_PATH + nombre + "." + ext, text)
def load_from_etree(self, collection):
    """
    Parse an `etree.SubElement` into attributes in this object.

    Also, caches the most recently used DOM object it is passed in `self.dom`
    """
    self._reset()
    self.dom = collection
    self.title = get_text(collection, NS['atom'] % 'title')
    # MUST have href attribute
    self.href = collection.attrib.get('href', None)
    # Accept and Accept multipart
    for accept in collection.findall(NS['app'] % 'accept'):
        if accept.attrib.get("alternate", None) == "multipart-related":
            self.accept_multipart.append(accept.text)
        else:
            self.accept.append(accept.text)
    # Categories
    for category_element in collection.findall(NS['atom'] % 'category'):
        self.categories.append(Category(dom=category_element))
    # SWORD extensions:
    self.collectionPolicy = get_text(collection, NS['sword'] % 'collectionPolicy')
    # Mediation: True/False
    mediation = get_text(collection, NS['sword'] % 'mediation')
    self.mediation = mediation.lower() == "true"
    self.treatment = get_text(collection, NS['sword'] % 'treatment')
    self.description = get_text(collection, NS['dcterms'] % 'abstract')
    self.service = get_text(collection, NS['sword'] % 'service', plural=True)
    self.acceptPackaging = get_text(collection, NS['sword'] % 'acceptPackaging', plural=True)
    # Log collection details:
    coll_l.debug(str(self))
def run(self, edit):
    text = utils.get_text()
    lineas = text.splitlines()
    lineas = list(set(lineas))
    text = ""
    for linea in lineas:
        text += linea + "\n"
    utils.set_text(text)
def parsing_td_time(items):
    the_time = ''
    for td_item in items:
        for div_item in td_item.find_all('div', {'class': 'airSchedule-time'}):
            time_tag = div_item.find('strong')
            if time_tag:
                for child in time_tag.contents:
                    the_time = get_text(child)
    return the_time
def process(self, msg):
    data = {
        'text': u.get_text(msg),
        'user_id': self.user_id
    }
    reply = self.get_reply(data)
    self.send_reply(reply)
def crawl_script_of(title):
    print("crawling:", title)
    url = "https://imsdb.com/scripts/" + title + ".html"
    soup = get_soup(url)
    crawled = soup.select("pre")[0]
    if title in ["Shawshank-Redemption,-The", "Toy-Story"]:
        crawled = crawled.select("pre")[0]
    former = "line"
    char_elem = None
    lines = []
    for c in crawled:
        if type(c) is Tag and former == "line" and re.match("^\n{1,}$", get_text(c)) is None:
            char_elem = get_text(c)
            former = "char"
            print("char:", char_elem)
        elif type(c) is NavigableString and former == "char":
            line_elem = get_text_plain(c)
            former = "line"
            print("line:", line_elem)
            print("---")
            if len(char_elem) > 1 and len(line_elem) > 1:
                if line_elem[0] != '\n':
                    lines.append([char_elem, line_elem])
        else:
            print("initial:", former)
            print(get_text(c) if type(c) is Tag else get_text_plain(c))
            former = "line"
    f = open("/mnt/UniversalUse/data/scripts_for_test/d/" + title + ".csv", "wt")
    for l in lines:
        l[0] = remove_brackets(l[0])
        l[1] = remove_brackets(l[1])
        if l[0] != '""' and l[1] != '""':
            print(l[0], "|||", l[1])
            f.write(l[0] + "," + l[1] + "\n")
    f.close()
def behave(button, bot, update):
    if 'menu' in button:
        generic_menu(button['menu'], update)
    elif 'action' in button:
        action_manager(bot, update, button['action'])
    elif 'text_file' in button:
        log_message(update.effective_message.reply_text(utils.get_text(button['text_file'])))
    elif 'text' in button:
        log_message(update.effective_message.reply_text(button['text']))
def get_info(url, soup=None):
    if soup is None:
        html = read_html(url)
        soup = Soup(html)
    info = {}
    info['artist'] = soup.find('span', {'itemprop': 'author'}).text.strip()
    info['title'] = soup.find('span', {'itemprop': 'name'}).text.strip()
    sss = get_sss(soup)
    info['novel_ex'] = get_text(sss[-2], '')
    return info
def create_data(root):
    json_files, txt_files = get_files(root)
    keys = [os.path.splitext(os.path.basename(f))[0] for f in json_files]
    data = {}
    for key, json_fn, txt_fn in zip(keys, json_files, txt_files):
        text = get_text(txt_fn)
        with open(json_fn, "r", encoding="utf-8") as fp:
            json_info = json.load(fp)
        text_space = regex.sub(r"[\t\n]", " ", text).upper()
        text_class = np.zeros(len(text), dtype=int)
        for field in json_info.keys():
            i = field_val[field]
            v = json_info[field]
            if field == "total":
                anchor = [i.start() for i in re.finditer('TOTAL', text_space)] + \
                         [i.start() for i in re.finditer('GROSS', text_space)] + \
                         [i.start() for i in re.finditer('AMOUNT', text_space)]
                points = [i.start() for i in re.finditer(v, text_space)]
                pos, dist = -1, 99999
                for p1 in points:
                    for p2 in anchor:
                        if p1 - p2 < 0:
                            continue
                        if '\n' in text[p2:p1]:
                            continue
                        if dist > p1 - p2:
                            dist = p1 - p2
                            pos = p1
                if pos == -1:
                    pos = text_space.find(v)
            else:
                pos = text_space.find(v)
            if pos == -1:
                s = None
                e = 0
                while s is None and e < 3:
                    e += 1
                    s = regex.search("(" + v + "){e<=" + str(e) + "}", text_space)
                if s is not None:
                    v = s[0]
                    pos = text_space.find(v)
            text_class[pos: pos + len(v)] = i
        data[key] = (text, text_class)
    return data
def run_predict_txt(model):
    root = 'test_data'
    for fn in sorted(os.listdir(root)):
        try:
            if not fn.endswith('txt'):
                continue
            fn = os.path.join(root, fn)
            text = get_text(fn)
            predict(args.device, model, [fn], [text])
        except Exception as e:
            print(str(e))
            print(fn)
def get_bleu(model, test_iterator, TRG, transformer):
    original_text = []
    generated_text = []
    with torch.no_grad():
        for i, batch in tqdm.tqdm(enumerate(test_iterator)):
            src = batch.src
            trg = batch.trg
            if transformer:
                src = src.permute(1, 0)
                trg = trg.permute(1, 0)
                try:
                    output, _ = model(src, trg)
                    output = output.permute(1, 0, 2)
                except IndexError as e:
                    logger.warning("get bleu index error {e}", e=e)
                    break
            else:
                output = model(src, trg, 0)  # turn off teacher forcing
            # trg = [trg sent len, batch size]
            # output = [trg sent len, batch size, output dim]
            if transformer:
                output = output.argmax(dim=-1).permute(1, 0)
            else:
                output = output.argmax(dim=-1)
            original_text.extend([get_text(x, TRG.vocab) for x in trg.cpu().numpy().T])
            generated_text.extend([get_text(x, TRG.vocab) for x in output[1:].detach().cpu().numpy().T])
    print(corpus_bleu([[text] for text in original_text], generated_text) * 100)
def run(self, edit, **args):
    if not args.get("nombre"):
        return
    nombre = args.get("nombre")
    for c in os.listdir(TEMPLATES_PATH):
        if nombre.lower() == c.lower()[:c.rfind(".")]:
            texto = utils.file_read(TEMPLATES_PATH + "/" + c)
            self.texto = texto
            if not utils.get_text().strip():
                self.insertar()
            else:
                # the view already has text; ask for a file name instead
                self.texto = texto
                window = sublime.active_window()
                window.show_input_panel("", c[c.rfind("."):], self.crear_archivo, None, None)
def test_get_text():
    inputs = [
        "tag",
        "message",
    ]

    def mock_input(s):
        return inputs.pop(0)

    utils.input = mock_input
    a, b = utils.get_text()
    if not a == 'tag':
        raise AssertionError()
    if not b == 'message':
        raise AssertionError()
def get_data(i, num=0):
    """
    Creates a list of (num) clean HTML news from page (i)
    (Stock symbol, news text and data)
    :param i: page number
    :param num: number of wanted news from page (i)
    :return: list of (Stock symbol, news text and data)
    """
    containers = utils.get_news(i)
    news_list = list()
    for i in range(num + 1):
        s = utils.get_text(containers[i])
        if s == "not a stock news":
            continue
        news_list.append(s)
    return news_list
def haikus_for_document(filename):
    """
    Analyzes a document for haikus. Returns a list of tuples.
    """
    text = get_text(filename)
    haikus = []
    # SpaCy has a maximum text size of 1,000,000 characters.
    # Let's use one fewer to be on the safe side.
    for chunk in chunks(text, 999_999):  # this underscore syntax was introduced in Python 3.6
        doc = nlp(chunk)
        for sent in doc.sents:
            haiku = check_haiku(sent)
            if haiku:
                haikus.append(haiku)
    return haikus
def text():
    if request.method == 'POST':
        # saving the received pdf file to upload folder
        f = request.files['file']
        f.save(os.path.join(app.config['UPLOAD_FOLDER'], secure_filename(f.filename)))
        # making pathlib 'Path' object to send to helper function
        path = Path(os.getcwd() + "/app/api_uploaded_files/" + f.filename)
        if (not path.is_file()) or (path.suffix != ".pdf"):
            return jsonify(error=400, text="Invalid Request"), 400
        text = get_text(path)
        return jsonify(pdf_name=secure_filename(f.filename), text=text)
    return render_template('index.html')
def run(self):
    jsonDiff = utils.load_json(DIFF_JSON)
    window = sublime.active_window()
    view = window.active_view()
    text = utils.get_text()
    filename = view.file_name()
    rutaCarpeta = get_folder(filename)
    utils.create_folder_if_not_exist(rutaCarpeta)
    nombreArchivo = time.strftime("%Y%m%d%H%M%S")
    lista = os.listdir(rutaCarpeta)
    escribir = True
    if lista:
        ultimo = max(lista)
        if filecmp.cmp(rutaCarpeta + os.sep + ultimo, filename):
            escribir = False
    if escribir:
        print("saving version...")
        rutaArchivo = rutaCarpeta + os.sep + nombreArchivo
        shutil.copyfile(filename, rutaArchivo)
        jsonDiff[filename] = nombreArchivo
        utils.save_json(DIFF_JSON, jsonDiff)
def movie(self):
    try:
        director = utils.get_text(self._tree, s.XPATH_DIRECTOR)
        genre = utils.get_text(self._tree, s.XPATH_GENRE)
        length = utils.get_text(self._tree, s.XPATH_LENGTH)
    except IndexError:
        try:
            director = utils.get_text(self._tree, s.XPATH_DIRECTOR_2)
            genre = utils.get_text(self._tree, s.XPATH_GENRE_2)
            length = utils.get_text(self._tree, s.XPATH_LENGTH_2)
        except IndexError:
            raise ScraperException()
    name_desc = utils.get_text(self._tree, s.XPATH_NAME_DESC).splitlines()
    cleaned_name_desc = utils.clean_name_and_desc(name_desc)
    name = cleaned_name_desc[0]
    desc = ''.join(cleaned_name_desc[1:])
    return Movie(name=name, desc=desc, director=director, genre=genre, length=length)
def __init__(self):
    text = utils.get_text()
    declaraciones = re.findall("\<([A-Z][a-z]*)\>", text)
    text = re.sub("\<([A-Z][a-z]*)\>", "", text)
    text = text.replace("< ", "").replace(" >", "").replace("<=", "").replace(">=", "")
    text = text.replace("abstract ", " ").replace("final ", " ").replace("public ", "").replace("private ", "").replace("protected ", "").replace("synchronized ", "").replace("volatile ", "").replace("class ", "public class").replace("static ", "").replace("< ", "").replace("> ", "").replace("<=", "").replace(">=", "")
    tipos = []
    variables = {}
    # collect every place a type name can appear: declarations, static calls,
    # parameters, annotations, constructors, inheritance, throws and generics
    declaraciones += re.findall("\n[\t ]*([A-Z][\w]*[ ]+[\w_]+)", text) + \
        re.findall("[^\w]([A-Z][\w]*)\.", text) + \
        re.findall("\(\s*([A-Z][\w]*[ ]+[\w_]+)", text) + \
        re.findall(",\s*([A-Z][\w]*[ ]+[\w_]+)", text) + \
        re.findall("\n[ \t]*@([A-Z][\w]*)", text) + \
        re.findall("new ([A-Z][\w]*)", text) + \
        re.findall("implements ([A-Z][\w]*)", text) + \
        re.findall("extends ([A-Z][\w]*)", text) + \
        re.findall("throws ([A-Z][\w]*)", text) + \
        re.findall("\<([A-Z][\w]*)\>", text)
    this = re.findall("extends ([A-Z][\w]*)", text)
    if this:
        declaraciones += [this[0] + " this"]
    for declaracion in declaraciones:
        declaracion = declaracion.strip()
        pos = declaracion.find(" ")
        if pos != -1:
            tipo = declaracion[:pos]
            variable = declaracion[pos + 1:]
            variables[variable] = tipo
            tipos.append(tipo)
        else:
            tipos.append(declaracion)
    newtipos = []
    tipos = list(set(tipos))
    for tipo in tipos:
        if re.findall("import [\w.]+\.%s;" % (tipo), text):
            # already imported
            continue
        newtipos.append(tipo)
    tipos = newtipos
    self.tipos = tipos
    self.variables = variables
def on_pre_save(self, view):
    lang = utils.get_language()
    if lang != "javascript" and lang != "nodejs":
        return
    text = utils.get_text()
    text = re.sub("\$\([\"'.\w#-]*\)", "jQuery", text)
    functions = re.findall("([$A-Za-z]+)\.([\w]+)\(", text)
    jsonPath = sublime.packages_path() + os.sep + "javascript" + os.sep + "functions.json"
    if lang == "nodejs":
        jsonPath = sublime.packages_path() + os.sep + "javascript" + os.sep + "functions_node.json"
    d = utils.load_json(jsonPath)
    for function in functions:
        key = function[0]
        if key == "$scope":
            continue
        value = function[1] + "()"
        if not d.get(key):
            d[key] = []
        if value not in d[key]:
            d[key].append(value)
    utils.save_json(jsonPath, d)
def on_chat_message(self, msg):
    self.save_message(msg, skip_reply=True)
    self.stage().on_chat_message(msg, u.get_text(msg))
def seleccionarVista(self, index):
    if index == -1:
        return
    archivo = open(self.vistas[index])
    texto = archivo.read()
    self.text = utils.get_text()
    archivo.close()
    nombreClase = re.findall("public\s+class\s+([\w]+)", self.text, flags=re.IGNORECASE)
    claseHereda = re.findall("public\s+class\s+[\w]+\s+extends\s+([\w]+)", self.text, flags=re.IGNORECASE)
    nombreClase = nombreClase[0]
    if claseHereda:
        claseHereda = claseHereda[0]
        archivos = utils.get_files({"match": claseHereda + ".java", "ignores": ["target", "build", ".svn", ".git", "bin"]})
        print("files found:")
        print(archivos)
        if archivos:
            self.text += open(archivos[0]).read()
    print("the class name is: " + nombreClase)
    reg_listener = 'listener=\s*"#\{%s\.([\w]+)\}"' % nombreClase
    reg_actionListener = 'actionListener=\s*"#\{%s\.([\w]+)\}"' % nombreClase
    reg_action = 'action=\s*"#\{%s\.([\w]+)\}"' % nombreClase
    reg_complete_method = 'completeMethod=\s*"#\{%s\.([\w]+)\}"' % nombreClase
    metodos = re.findall(reg_listener, texto, flags=re.IGNORECASE)
    metodos += re.findall(reg_actionListener, texto, flags=re.IGNORECASE)
    metodos += re.findall(reg_complete_method, texto, flags=re.IGNORECASE)
    metodos += re.findall(reg_action, texto, flags=re.IGNORECASE)
    texto = re.sub(reg_listener, "", texto, flags=re.IGNORECASE)
    texto = re.sub(reg_actionListener, "", texto, flags=re.IGNORECASE)
    texto = re.sub(reg_action, "", texto, flags=re.IGNORECASE)
    texto = re.sub(reg_complete_method, "", texto, flags=re.IGNORECASE)
    atributos = re.findall("#\{%s\.([\w]+)\}" % nombreClase, texto, flags=re.IGNORECASE)
    atributos = list(set(atributos))
    metodos = list(set(metodos))
    self.generado = ""
    print(atributos)
    print(metodos)
    self.listAtributos = []
    self.listMetodos = []
    self.total = 0
    self.i = 0
    for atributo in atributos:
        if self.text.find("get" + atributo[0].upper() + atributo[1:] + "(") == -1:
            self.listAtributos.append([atributo])
            self.total += 1
    for metodo in metodos:
        if self.text.find(metodo) == -1:
            self.listMetodos.append(metodo)
    print(self.listAtributos)
    print(self.listMetodos)
    if not self.listAtributos and self.listMetodos:
        self.llenar()
        return
    if not self.listAtributos and not self.listMetodos:
        return
    self.pedir()
def get_all_page_data(url, is_community=False):
    name = url.split("/")[-1] if len(url.split("/")[-1]) > 0 else url.split("/")[-2]
    if is_community:
        name = os.path.join(name, "community")
        url = url + "/community"
    data_path = os.path.join(".", "data")
    if not os.path.exists(data_path):
        os.mkdir(data_path)
    page_data_path = os.path.join(data_path, name)
    if not os.path.exists(page_data_path):
        os.mkdir(page_data_path)
    should_scrape_headless = not is_community
    driver = initialize_driver(args.chrome, args.windows, is_headless=should_scrape_headless)
    driver.get(url)
    page_name = get_text(driver, './/a[@class="_64-f"]')
    print(f"Scrolling {url} until {cutoff_date}")
    scroll(driver, pd.to_datetime(cutoff_date))
    posts = driver.find_elements_by_xpath('//div[contains(@class, "userContentWrapper")]')
    post_links = [get_post_links(post) for post in tqdm(posts)]
    post_links = list(set(post_links))
    with open(os.path.join(page_data_path, 'post_links.json'), 'w') as f:
        json.dump(post_links, f)
    driver.quit()
    print(f"Now scraping {len(post_links)} posts from {name}")
    for i, post_link in enumerate(post_links):
        if not is_string_url(post_link):
            continue
        print(f"Scraping {post_link}")
        driver = initialize_driver(args.chrome, args.windows)
        driver.get(post_link)
        if "/videos/" in post_link:
            post_type = "videos"
        elif "/photos/" in post_link:
            post_type = "photos"
        elif "/posts/" in post_link:
            post_type = "posts"
        elif "/notes/" in post_link:
            post_type = "notes"
        else:
            post_type = "other"
        if post_type == "notes":
            post_element = driver.find_element_by_xpath('.//div[contains(@class, "fb_content")]')
        else:
            post_element = driver.find_element_by_xpath('.//div[contains(@class, "userContentWrapper")]')
        post_data = get_post_data(driver, post_element, post_type)
        post_data["page_name"] = page_name
        with open(os.path.join(page_data_path, f'page_post_{i}.json'), 'w') as f:
            json.dump(post_data, f)
        driver.quit()
    if not is_community:
        get_all_page_data(url, is_community=True)
def run(self, edit): texto="" for linea in reversed(utils.get_text().splitlines()) : texto+=linea+"\n" utils.set_text(texto)
#!/usr/bin/env python3
from utils import get_text

input = get_text('13')
# input = [
#     '939',
#     # '7,13,x,x,59,x,31,19'
#     # '17,x,13,19'
#     # '67,7,x,59,61'
#     '1789,37,47,1889'
# ]

time = int(input[0])
original = input[1].split(',')
buses = list(map(lambda x: int(x), filter(lambda x: x != 'x', original)))

arrivals = []
for bus in buses:
    prev = time // bus
    arrivals.append({'id': bus, 'arrival': bus * (prev + 1)})

earliest = next(filter(lambda x: x['arrival'] > time, sorted(arrivals, key=lambda k: k['arrival'])))
print(earliest['id'] * (earliest['arrival'] - time))


def closure(idx, num, sign):
    def f(x):
        # print(f"x = {x} idx = {idx * sign} num = {num}")
        return (x + (idx * sign)) % num == 0
    return f  # assumed: the factory returns the inner predicate
def run(self, edit):
    window = sublime.active_window()
    view = window.active_view()
    self.lang = utils.get_language()
    self.regMetodos = {
        "python": "def\\s+%(nombre)s\\(",
        "python3": "def\\s+%(nombre)s\\(",
        "ruby": "def\\s+%(nombre)s",
        "java": "[\w].+\s+%(nombre)s\(",
        "javascript": "function\\s*%(nombre)s\\(|%(nombre)s\\s*=\\s*function\\(",
        "nodejs": "function\\s*%(nombre)s\\(|%(nombre)s\\s*=\\s*function\\(",
        "c": "\\b%(nombre)s\\([^)]*\\)\\s*\\n?\\s*\\{",
        "c#": "\\b%(nombre)s\\([^)]*\\)\\s*\\n?\\s*\\{",
        "c++": "\\b%(nombre)s\\([^)]*\\)\\s*\\n?\\s*\\{"
    }
    self.regVariables = {
        "python": "\\b%(nombre)s\\s*=[^=]?|\\b%(nombre)s\\s+in\\s+|def [\\w_]+\\(.*\\b%(nombre)s",
        "python3": "\\b%(nombre)s\\s*=[^=]?|\\b%(nombre)s\\s+in\\s+|def [\\w_]+\\(.*\\b%(nombre)s",
        "ruby": "\\b%(nombre)s\\s*=[^=]?|\\b%(nombre)s\\s+in\\s+|def [\\w_]+\\(.*\\b%(nombre)s",
        "java": "\\b%(nombre)s\\s*=[^=]?|[\\w]+\\s+%(nombre)s;|[\\w]+\\s+%(nombre)s,",
        "javascript": "\\b%(nombre)s\\s*=[^=]?|var+\\s+%(nombre)s;|var+\\s+%(nombre)s,",
        "nodejs": "\\b%(nombre)s\\s*=[^=]?|var+\\s+%(nombre)s;|var+\\s+%(nombre)s,",
        "c": "\\b%(nombre)s\\s*=[^=]?|[\\w]+\\s+%(nombre)s;|[\\w]+\\s+%(nombre)s,",
        "c#": "\\b%(nombre)s\\s*=[^=]?|[\\w]+\\s+%(nombre)s;|[\\w]+\\s+%(nombre)s,",
        "c++": "\\b%(nombre)s\\s*=[^=]?|[\\w]+\\s+%(nombre)s;|[\\w]+\\s+%(nombre)s,",
        "jsf": 'id\\s*=\\s*"%(nombre)s"'
    }
    self.comentarios = {
        "python": '#[^\\n]\\n|"""[^"]"""',
        "python3": '#[^\\n]\\n|"""[^"]"""',
        "ruby": '#[^\\n]\\n|"""[^"]"""',
        "java": "//[^\\n]\\n|/[*][^/]*[*]/",
        "javascript": "//[^\\n]\\n|/[*][^/]*[*]/",
        "nodejs": "//[^\\n]\\n|/[*][^/]*[*]/",
        "c": "//[^\\n]\\n|/[*][^/]*[*]/",
        "c#": "//[^\\n]\\n|/[*][^/]*[*]/",
        "c++": "//[^\\n]\\n|/[*][^/]*[*]/",
        "jsf": "<!--[^-]->"
    }
    var = utils.get_word_signature()
    print(var)
    isMethod = utils.is_method()
    isUnique = var.find(".") == -1
    if self.lang == "python" and var.startswith("self."):
        isUnique = True
        var = var[var.find(".") + 1:]
    elif self.lang == "java" and var.startswith("this."):
        isUnique = True
        var = var[var.find(".") + 1:]
    if isMethod:
        if isUnique:
            self.goto_method(var)
        else:
            self.goto_class_method(var[:var.find(".")], var[var.find(".") + 1:])
    else:
        if isUnique:
            self.goto_definition(var)
            paquete = re.findall("import\s+([\w._]+\." + var + ");", utils.get_text(), flags=re.IGNORECASE)
            if var[0].isupper() and paquete:
                print("going to the class")
                self.goto_class(paquete[0])
        else:
            self.goto_class_definition(var[:var.find(".")], var[var.find(".") + 1:])
            print("not unique")
def process_file(filename, output_path=None, lang='sk', verbose=True):
    xmldoc = ET.parse(filename)
    root = xmldoc.getroot()
    organizacnaJednotka = root.find('organizacnaJednotka').text
    ilisty = root.find('informacneListy')
    if verbose:
        print " Found %d information sheets." % len(ilisty.findall('informacnyList'))

    # elements to parse from the XML
    # key => XPath (the key is used later in the template)
    elements = {'kod': 'kod',
                'nazov': 'nazov',
                'kredit': 'kredit',
                'sposobUkoncenia': 'sposobUkoncenia',
                'studijnyProgram': 'studijneProgramy/studijnyProgram/popis',
                'datumSchvalenia': 'datumSchvalenia',
                'obsahovaNapln': '_ON_/texty',
                'vahaHodnotenia': '_VH_/texty',
                'garanti': 'garanti/garant/plneMeno'}

    data = []
    # process the information sheets of the individual courses
    for il in ilisty.findall('informacnyList'):
        # skip courses that are not state exams
        if il.find('_ON_') is None:
            continue
        d = {'lang': lang, 'organizacnaJednotka': organizacnaJednotka}
        for key, path in elements.iteritems():
            if il.find(path) is not None:
                if path.startswith('_'):
                    d[key] = utils.get_text(il.find(path))
                elif key == 'studijnyProgram':
                    d[key] = [el.text for el in il.findall(path)]
                else:
                    d[key] = il.find(path).text
            else:
                d[key] = ''
        # normalize course codes
        d['kod'] = utils.parse_code(d['kod'])
        data.append(d)

    # load the HTML template
    script_abs_path = os.path.dirname(os.path.abspath(__file__))
    tpl_path = os.path.join(script_abs_path, 'templates')
    env = Environment(loader=FileSystemLoader(tpl_path))
    tpl_name = 'template_statne-skusky_table_%s.html' % lang
    html_tpl = env.get_template(tpl_name)

    # write the output files
    for course in data:
        kod_predmetu = course['kod']
        html = html_tpl.render(course)
        filename = '%s.html' % kod_predmetu
        if output_path is not None:
            path = os.path.join(output_path, filename)
            if not os.path.exists(output_path):
                os.mkdir(output_path)
        else:
            path = filename
        with open(path, 'w') as f:
            f.write(html.encode('utf8'))
import utils


def count_characters(s):
    counts = {}
    for char in s:
        if char in counts:
            counts[char] += 1
        else:
            counts[char] = 1
    return counts


if __name__ == '__main__':
    # Look for least common characters.
    counts = count_characters(utils.get_text(2))
    print counts
    counts_sorted = sorted(counts, key=counts.get)
    print counts_sorted
    print ''.join(counts_sorted[0:8])

    # Try to shift least common characters.
    import challenge_1
    s = ''.join(counts_sorted[0:8])
    s = challenge_1.shift(s, 2)
    print s.lower()  # aeilquty -> equality
#!/usr/bin/env python3
from utils import get_text
from pprint import pprint

input = get_text('11')
# input = [
#     "L.LL.LL.LL",
#     "LLLLLLL.LL",
#     "L.L.L..L..",
#     "LLLL.LL.LL",
#     "L.LL.LL.LL",
#     "L.LLLLL.LL",
#     "..L.L.....",
#     "LLLLLLLLLL",
#     "L.LLLLLL.L",
#     "L.LLLLL.LL",
# ]


def count_occurences(src, y, x, n):
    result = 0
    height = len(src)
    width = len(src[0])
    miny = y - 1 if y - 1 >= 0 else 0
    maxy = y + 1 if y + 1 < height else height - 1
    minx = x - 1 if x - 1 >= 0 else 0
    maxx = x + 1 if x + 1 < width else width - 1
    # assumed completion: tally n over the clamped neighbourhood, skipping the centre cell
    for i in range(miny, maxy + 1):
        for j in range(minx, maxx + 1):
            if (i, j) != (y, x) and src[i][j] == n:
                result += 1
    return result
from utils import get_text


def part_one(commands):
    position = [0, 0]  # assumed initializer: [horizontal, depth], mirroring part_two below
    for command in commands:
        direction, distance = command.split(' ')
        if direction == 'forward':
            position[0] += int(distance)
        elif direction == 'up':
            position[1] -= int(distance)
        else:
            position[1] += int(distance)
    return position[0] * position[1]


def part_two(commands):
    horizontal, vertical, aim = [0, 0, 0]
    for command in commands:
        direction, units = command.split(' ')
        if direction == 'forward':
            horizontal += int(units)
            vertical += (aim * int(units))
        elif direction == 'up':
            aim -= int(units)
        else:
            aim += int(units)
    return horizontal * vertical


data = get_text('02')
print(part_one(data))
print(part_two(data))
def save(self, name):
    if name is None:
        return
    print("before: " + self.rutaSamples)
    samples = utils.load_json(self.rutaSamples)
    samples[name] = utils.get_text()
    utils.save_json(self.rutaSamples, samples)
import nltk
import pandas as pd
import string
from argparse import ArgumentParser
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from utils import open_csvs, read_csvs, get_text

parser = ArgumentParser()
parser.add_argument('-f', dest='data_path')
parser.add_argument('-o', dest='out_file')
args = parser.parse_args()

if __name__ == '__main__':
    print("Reading data")
    data = read_csvs(args.data_path)
    documents = get_text(data)
    sid = SentimentIntensityAnalyzer()
    with open(args.out_file, 'w') as f:
        for x in documents:  # iterate over the extracted documents
            ss = sid.polarity_scores(x)
            out_line = x + ',' + str(ss['compound']) + '\n'
            f.write(out_line)
#!/usr/bin/env python3
from utils import get_text

input = get_text('02')

store = []
for line in input:
    r, t, p = line.split(' ')
    mn, mx = map(int, r.split('-'))
    term = t.rstrip(':')
    store.append({'min': mn, 'max': mx, 'term': term, 'password': p})

result = 0
for entry in store:
    occurences = entry['password'].count(entry['term'])
    if entry['min'] <= occurences <= entry['max']:
        result += 1
print(result)

result = 0
for entry in store:
    first = entry['password'][entry['min'] - 1] == entry['term']
    second = entry['password'][entry['max'] - 1] == entry['term']
    if first ^ second:
        result += 1
print(result)
def validate(self):
    valid = True
    if not self.parsed:
        return False
    # The SWORD server MUST specify the sword:version element with a value of 2.0
    # -- MUST have sword:version element
    # -- MUST have value of '2.0'
    self.version = get_text(self.service_dom, NS["sword"] % "version")
    if self.version:
        if self.version != "2.0":
            # Not a SWORD2 server...
            # Fail here?
            sd_l.error("The service document states that the server's endpoint is not SWORD 2.0 - stated version:%s" % self.version)
            valid = False
    else:
        sd_l.error("The service document did not have a sword:version")
        valid = False
    # The SWORD server MAY specify the sword:maxUploadSize (in kB) of content that can be
    # uploaded in one request [SWORD003] as a child of the app:service element.
    # If provided this MUST contain an integer.
    maxupload = get_text(self.service_dom, NS["sword"] % "maxUploadSize")
    if maxupload:
        try:
            self.maxUploadSize = int(maxupload)
        except ValueError:
            # Unparsable as an integer. Enough to fail a validation?
            # Strictly... yep
            sd_l.error("The service document did not have maximum upload size parseable as an integer.")
            valid = False
    # Check the first workspace for a collection element, just to make sure there is something there.
    test_workspace = self.service_dom.find(NS["app"] % "workspace")
    if test_workspace is not None:
        sd_l.debug("At least one app:workspace found, with at least one app:collection within it.")
    else:
        valid = False
        sd_l.error("Could not find an app:workspace element in the service document.")
    # The SWORD server MUST specify the app:accept element for the app:collection element.
    # If the Collection can take any format content type, it should specify */* as its
    # value [AtomPub]. It MUST also specify an app:accept element with an alternate attribute
    # set to multipart-related as required by [AtomMultipart]. The formats specified by
    # app:accept and app:accept@alternate="multipart-related" are RECOMMENDED to be the same.
    workspaces = self.service_dom.findall(NS["app"] % "workspace")
    if workspaces is not None:
        for workspace in workspaces:
            cols = workspace.findall(NS["app"] % "collection")
            for col in cols:
                # the collection may contain a sub-service document, which means it is not
                # beholden to the rules above
                service = col.find(NS["sword"] % "service")
                if service is not None:
                    continue
                # since we have no sub-service document, we must validate
                accept_valid = True
                multipart_accept_valid = True
                accepts = col.findall(NS["app"] % "accept")
                for accept in accepts:
                    multipart = accept.get("alternate")
                    if multipart is not None:
                        if multipart != "multipart-related":
                            multipart_accept_valid = False
                            sd_l.debug("Multipart accept alternate is incorrect: " + str(multipart))
                    else:
                        # FIXME: we could test to see if the content is viable, but probably that's pointless
                        pass
                if not multipart_accept_valid or not accept_valid:
                    sd_l.debug("Either the multipart accept or the accept fields were invalid (see above debug)")
                    valid = False
    return valid
def get_abv(self):
    '''Attempts to find percentage of alcohol by volume using Bing'''
    abv = ''
    found_abv = ''
    # A ceiling for ABV content for validation.
    # We can assume BevMo does not offer kegs with this high of an ABV.
    max_abv = 20.0
    if not self.parsed:
        self.parse()
    search_url = 'https://www.bing.com/search?q={0}+alcohol+content'.format('+'.join(self.name.split()))
    search_links = get_html(search_url).xpath('//a/@href')
    new_search_links = search_links[search_links.index('javascript:'):][1:]
    results = [x for x in new_search_links if x != '#' and 'site:' not in x]
    # Max number of links to search for alcohol by volume (ABV)
    num_attempts = self.num_attempts
    # Filter links with same domain to improve chances of matching
    searched_domains = set()
    # Add the top page results that are unique; r_it is an iterator
    top_results = []
    r_it = 0
    result_link = ''
    while len(top_results) < num_attempts and r_it < len(results):
        result_link = results[r_it]
        domain = '{url.netloc}'.format(url=urlparse(result_link))
        if '.' in domain:
            if domain.count('.') > 1:
                domain = domain.split('.')[1]
            else:
                domain = domain.split('.')[0]
        # Avoid already searched domains
        if domain in searched_domains:
            r_it += 1
        else:
            top_results.append(result_link)
            r_it += 1
            searched_domains.add(domain)
    for i in xrange(min(num_attempts, len(top_results))):
        if self.verbose:
            print('Searching {}'.format(top_results[i]))
        try:
            search_text = ''.join(get_text(get_html(top_results[i])))
        except Exception:
            continue
        # Retrieves partial string containing the word ABV and a %
        abv = re.search('(?<=[Aa][Bb][Vv])[^\d]*(\d+[.]?\d*)(?=%)|(?<=%)[^\d]*(\d+[.]?\d*)[^\d]*(?=[Aa][Bb][Vv])', search_text)
        if abv:
            abv = abv.group()
            # Filters for a number with or without a decimal pt
            abv = float(re.search('(\d+[.]?\d*)', abv).group())
            # If new ABV is 0.0, return previously found ABV if any;
            # otherwise, move on to the next link
            if abv == 0.0:
                if found_abv:
                    if self.verbose:
                        print('ABV for {} is {}'.format(self.name, abv))
                else:
                    continue
            if abv < max_abv:
                if abv < max_abv / 2:
                    if self.verbose:
                        print('ABV for {} is {}'.format(self.name, abv))
                    return abv
                # Replace the new ABV only if the next is lower
                if found_abv:
                    if abv < found_abv:
                        if self.verbose:
                            print('ABV for {} is {}'.format(self.name, abv))
                        return abv
                    else:
                        if self.verbose:
                            print('ABV for {} is {}'.format(self.name, found_abv))
                        return found_abv
                # Sets the new ABV to the found ABV
                found_abv = abv
        else:
            if found_abv:
                if self.verbose:
                    print('ABV for {} is {}'.format(self.name, found_abv))
                return found_abv
    # No ABV was found by this point
    if self.verbose:
        print('ABV not found for {}'.format(self.name))
    return None
        pos = doc.find('</p>', pos)
    elif tag == '>SEÇÃO':
        secao += 1
        subsecao = 0
        pos = doc.find('</p>', pos)
    elif tag == '>Subseção':
        subsecao += 1
        pos = doc.find('</p>', pos)
    elif tag[:3] == 'Art':
        artigo += 1
        idt += 1
        subartigo, inciso, paragrafo = 0, 0, 0
        t = get_text(doc, pos)
        g.writerow([idt, livro, titulo, capitulo, secao, subsecao, artigo, subartigo, paragrafo, inciso, alinea, t])
        pos = doc.find('</p>', pos)
    elif tag == 'SubArtigo':
        i = pos
        tmp = ''
        while doc[i] != '-':
            if doc[i] in '0987654321':
                tmp += doc[i]
            i += 1
        if int(tmp) != artigo:
            artigo = int(tmp)
        subartigo += 1
def run(self, edit):
    utils.set_text(sublime.encode_value(sublime.decode_value(utils.get_text()), True))
def train(config, sample_validation_batches):
    source_language = config.get('src_language')
    target_language = config.get('trg_language')
    EOS_token = config.get('EOS_token')
    PAD_token = config.get('PAD_token')
    SOS_token = config.get('SOS_token')
    train_iter = config.get('train_iter')
    val_iter = config.get('val_iter')
    writer_path = config.get('writer_path')
    writer_train_path = get_or_create_dir(writer_path, 'train')
    writer_val_path = get_or_create_dir(writer_path, 'val')
    writer_train = SummaryWriter(log_dir=writer_train_path)
    writer_val = SummaryWriter(log_dir=writer_val_path)
    epochs = config.get('epochs')
    training = config.get('training')
    eval_every = training.get('eval_every')
    sample_every = training.get('sample_every')
    use_attention = config.get('use_attention')
    step = 1
    for epoch in range(epochs):
        print(f'Epoch: {epoch+1}/{epochs}')
        save_weights(config)
        for i, training_batch in enumerate(train_iter):
            loss = train_batch(config, training_batch)
            writer_train.add_scalar('loss', loss, step)
            if step == 1 or step % eval_every == 0:
                val_lengths = 0
                val_losses = 0
                reference_corpus = []
                translation_corpus = []
                for val_batch in val_iter:
                    val_loss, translations = evaluate_batch(config, val_batch)
                    val_lengths += 1
                    val_losses += val_loss
                    val_batch_trg, _ = val_batch.trg
                    _, batch_size = val_batch_trg.shape
                    references = map(lambda i: torch2words(target_language, val_batch_trg[:, i]), range(batch_size))
                    references = map(lambda words: [list(filter_words(words, SOS_token, EOS_token, PAD_token))], references)
                    reference_corpus.extend(references)
                    translations = map(lambda translation: list2words(target_language, translation), translations)
                    translations = map(lambda words: list(filter_words(words, SOS_token, EOS_token, PAD_token)), translations)
                    translation_corpus.extend(translations)
                bleu = compute_bleu(reference_corpus, translation_corpus)
                val_loss = val_losses / val_lengths
                writer_val.add_scalar('bleu', bleu, step)
                writer_val.add_scalar('loss', val_loss, step)
            if step % sample_every == 0:
                val_batch = sample_validation_batches(1)
                val_batch_src, val_lengths_src = val_batch.src
                val_batch_trg, _ = val_batch.trg
                s0 = val_lengths_src[0].item()
                _, translations, attention_weights = evaluate_batch(config, val_batch, True)
                source_words = torch2words(source_language, val_batch_src[:, 0])
                target_words = torch2words(target_language, val_batch_trg[:, 0])
                translation_words = list(filter(lambda word: word != PAD_token, list2words(target_language, translations[0])))
                if use_attention and sum(attention_weights.shape) != 0:
                    attention_figure = visualize_attention(source_words[:s0], translation_words, with_cpu(attention_weights))
                    writer_val.add_figure('attention', attention_figure, step)
                text = get_text(source_words, target_words, translation_words, SOS_token, EOS_token, PAD_token)
                writer_val.add_text('translation', text, step)
            step += 1
    save_weights(config)
def extract_infolists(filename, lang='sk', mode='regular', webpages={}, verbose=True):
    """Extract all infolists with all of their courses from a study program XML file.

    Params:
        filename: path to the XML file
        lang: language

    Returns:
        list of infolists with course dicts
    """
    xmldoc = ET.parse(filename)
    root = xmldoc.getroot()
    organizacnaJednotka = root.find('organizacnaJednotka').text
    vysokaSkola = root.find('vysokaSkola').text
    fakulta = root.find('fakulta').text
    ilisty = root.find('informacneListy')
    if verbose:
        print " Found %d information sheets." % len(ilisty.findall('informacnyList'))

    # elements to parse from the XML
    # key => XPath (the key is used later in the template)
    elements = {'kod': 'kod',
                'nazov': 'nazov',
                'kredit': 'kredit',
                'sposobVyucby': 'sposobVyucby',
                'rozsahTyzdenny': 'rozsahTyzdenny',
                'rozsahSemestranly': 'rozsahSemestranly',
                'rokRocnikStudPlan': 'rokRocnikStudPlan',
                'kodSemesterStudPlan': 'kodSemesterStudPlan',
                'sposobUkoncenia': 'sposobUkoncenia',
                'studijnyProgram': 'studijneProgramy/studijnyProgram/popis',
                'podmienujucePredmety': 'podmienujucePredmety',
                'vylucujucePredmety': 'vylucujucePredmety',
                'doplujuceUdaje': 'doplujuceUdaje',
                'zabezpecuju': 'zabezpecuju',
                'strucnaOsnova': '_SO_/texty',
                'ciel': '_C_/texty',
                'zaverecneHodnotenie': '_Z_/texty/p',
                'literatura': '_L_/texty',
                'priebezneHodnotenie': '_P_/texty/p',
                'obsahovaPrerekvizita': '_O_/texty',
                'sylabus': '_S_/texty',
                'datumSchvalenia': 'datumSchvalenia',
                'vahaHodnotenia': '_VH_/texty/p',
                'garanti': 'garanti/garant/plneMeno',
                'jazyk': '_PJ_/texty/p',
                'obsahovaNapln': '_ON_/texty',
                'podmienkyAbsolvovania': '_PA_/texty',
                'vysledkyVzdelavania': '_VV_/texty'}

    data = []
    # process the information sheets of the individual courses
    for il in ilisty.findall('informacnyList'):
        # skip state exams; those are handled by a different script
        if mode == 'regular' and (il.find('_ON_') is not None):
            continue
        if mode == 'statnice' and (il.find('_ON_') is None):
            continue
        d = {'lang': lang,
             'organizacnaJednotka': organizacnaJednotka,
             'vysokaSkola': vysokaSkola,
             'fakulta': fakulta}
        for key, path in elements.iteritems():
            if il.find(path) is not None:
                if key != 'vahaHodnotenia' and path.startswith('_'):
                    d[key] = utils.get_text(il.find(path))
                elif key in ['studijnyProgram', 'jazyk']:
                    d[key] = [el.text for el in il.findall(path)]
                    if key == 'jazyk':
                        d[key] = list(set(d[key]))
                else:
                    d[key] = il.find(path).text
            else:
                d[key] = ''
        # normalize course codes
        d['kod'] = utils.parse_code(d['kod'])
        # course home page
        if d['kod'] in webpages:
            d['webStranka'] = webpages[d['kod']]
        data.append(d)
    return data