def __init__( self, rhythm_type='', step='', octave='', staff='', is_note=True, bs_node=None):
    """Initialize either from a parsed XML node or from explicit values.

    When *bs_node* is given (presumably a BeautifulSoup tag for a MusicXML
    <note> element -- TODO confirm), all fields are extracted from it and the
    explicit keyword values are ignored.  Otherwise the keyword values are
    stored as-is.
    """
    if bs_node:
        # A <rest> child marks the element as a rest rather than a pitched note.
        self.is_note = not bs_node('rest')
        self.bs_node = bs_node
        # Both notes and rests carry a rhythm type (was duplicated in each
        # branch of the original; hoisted here).
        self.rhythm_type = extract_text(bs_node, 'type')
        if self.is_note:
            self.step = extract_text(bs_node, 'pitch step')
            self.octave = int(extract_text(bs_node, 'pitch octave'))
            self.is_chord = bool(bs_node('chord'))
        else:
            # Rests have no pitch and cannot be part of a chord.
            self.step = None
            self.octave = None
            self.is_chord = None
        self.staff = int(extract_text(bs_node, 'staff'))
        self.is_hidden = is_hidden(bs_node)
    else:
        # Direct construction: store the explicit values unchanged.
        self.rhythm_type = rhythm_type
        self.step = step
        self.octave = octave
        self.staff = staff
        self.is_note = is_note
def test5():
    """Exercise util.extract_text greedy matching on text2; returns a report string."""
    ret = ''
    ret += '------------\n'
    # Header line added for consistency: every sibling test (test1..test4,
    # test7, test8) prints its own name before the first case.
    ret += 'test5\n'
    ret += '------------\n'
    ret += 'start, end, greedy=True\n'
    ret += util.extract_text(text2, r'start', 'end', greedy=True)
    ret += '\n'
    ret += '------------\n'
    ret += 'start, end, greedy=False\n'
    ret += util.extract_text(text2, r'start', 'end', greedy=False)
    ret += '\n'
    return ret
def test7():
    """Exercise util.extract_text's count parameter on text4; returns a report string."""
    segments = ['------------\n', 'test7\n']
    for count in (1, 2):
        segments.append('------------\n')
        segments.append('start, end, greedy=True, count=%d\n' % count)
        segments.append(util.extract_text(text4, r'start', 'end', greedy=True, count=count))
        segments.append('\n')
    return ''.join(segments)
def test4():
    """Exercise util.extract_text with swapped start/end markers on text1."""
    chunks = ['------------\n', 'test4\n']
    for label, begin, finish in (
        ('start, 123', r'start', '123'),
        ('123, start', r'123', 'start'),
    ):
        chunks.append('------------\n')
        chunks.append(label + '\n')
        chunks.append(util.extract_text(text1, begin, finish))
        chunks.append('\n')
    return ''.join(chunks)
def _search(keywords):
    """Full-text search for news matching *keywords*.

    Returns up to 10 hits as dicts (image_public_url, share_url, date, title,
    summary).  A hit that fails to render is logged and skipped so one bad row
    cannot break the whole result list.
    """
    import search
    hits = []
    results = search.fts.search(keywords, limit=10)
    db = database.Dao()
    try:
        for hit in results:
            try:
                news = db.get_news(hit['news_id'])
                # Row layout assumed: 2=title, 3=share_url, 4=date, 5=body,
                # 8=image url -- TODO confirm against the Dao schema.
                text = util.extract_text(news[5])
                summary = hit.highlights('content', text=text, top=2)
                hits.append(dict(
                    image_public_url=news[8],
                    share_url=news[3],
                    date=news[4],
                    title=news[2],
                    summary=summary,
                ))
            except Exception as e:  # 'as' form is valid on Python 2.6+ and 3.x
                stack = traceback.format_exc()
                logging.error("one hit error: %s\n%s" % (e, stack))
    finally:
        db.close()
    return hits
def _search(keywords):
    """Full-text search for news matching *keywords* via an FTSSearcher instance.

    Returns up to 10 hits as dicts (image_public_url, share_url, date, title,
    summary).  Per-hit failures are logged and skipped; the DB connection and
    the searcher are always closed.
    """
    import search
    hits = []
    fts_searcher = search.FTSSearcher()
    results = fts_searcher.search(keywords, limit=10)
    db = database.Dao()
    try:
        for hit in results:
            try:
                news = db.get_news(hit['news_id'])
                # Row layout assumed: 2=title, 3=share_url, 4=date, 5=body,
                # 8=image url -- TODO confirm against the Dao schema.
                text = util.extract_text(news[5])
                summary = hit.highlights('content', text=text, top=2)
                hits.append(dict(
                    image_public_url=news[8],
                    share_url=news[3],
                    date=news[4],
                    title=news[2],
                    summary=summary,
                ))
            except Exception as e:  # 'as' form is valid on Python 2.6+ and 3.x
                stack = traceback.format_exc()
                logging.error("one hit error: %s\n%s" % (e, stack))
    finally:
        db.close()
        fts_searcher.close()
    return hits
def _index_news_list(news_list):
    """Index each news item into the full-text-search store.

    Each item's HTML body is reduced to plain text before indexing.
    """
    from search import fts
    docs = [
        dict(
            news_id=item['id'],
            title=item['title'],
            content=util.extract_text(item.get('body', '')),
        )
        for item in news_list
    ]
    fts.add_many_docs(docs)
def test8():
    """Exercise util.extract_text on text1 with None start/end boundaries."""
    out = ['------------\n', 'test8\n']
    for label, begin, finish in (
        ('start=None', None, 'end'),
        ('end=None', 'start', None),
        ('start=None, end=None', None, None),
    ):
        out.append('------------\n')
        out.append(label + '\n')
        out.append(util.extract_text(text1, start=begin, end=finish))
        out.append('\n')
    return ''.join(out)
def test2():
    """Exercise util.extract_text on text1 with non-matching markers."""
    # Start from an empty accumulator like every sibling test; the original
    # seeded ret with a 39-dash fragment lacking a newline, which fused into
    # the first divider line of the report.
    ret = ''
    ret += '------------\n'
    ret += 'test2\n'
    ret += '------------\n'
    ret += 'nostart, end\n'
    ret += util.extract_text(text1, 'nostart', 'end')
    ret += '\n'
    ret += '------------\n'
    ret += 'start, noend\n'
    ret += util.extract_text(text1, 'start', 'noend')
    ret += '\n'
    ret += '------------\n'
    ret += 'nostart, noend\n'
    ret += util.extract_text(text1, 'nostart', 'noend')
    ret += '\n'
    return ret
def word_list(lst_dic, type=None):
    """Return a space-joined string built from *lst_dic*.

    With no *type*, the items of *lst_dic* are joined directly.  With
    type="text" or type="hashtag" the corresponding util extractor is applied
    first.  Any other *type* raises ValueError (previously it crashed with an
    unrelated NameError because ``text`` was never bound).
    """
    if not type:
        return " ".join(lst_dic)
    if type == "text":
        text = util.extract_text(lst_dic)
    elif type == "hashtag":
        text = util.extract_hash(lst_dic)
    else:
        raise ValueError("unsupported type: %r" % (type,))
    return " ".join(text)
def _index_news_list(news_list):
    """Index each news item into full-text search via an FTSIndexer instance."""
    import search
    fts_indexer = search.FTSIndexer()
    docs = []
    for item in news_list:
        plain_body = util.extract_text(item.get("body", ""))
        doc = dict(news_id=item["news_id"], title=item["title"], content=plain_body)
        docs.append(doc)
    fts_indexer.add_many_docs(docs)
def test3():
    """Exercise util.extract_text with regex digit markers on text1."""
    extracted = util.extract_text(text1, r'\d', r'\d\s\d')
    return ''.join([
        '------------\n',
        'test3\n',
        '------------\n',
        '\\d, \\d\\s\\d\n',
        extracted,
        '\n',
    ])
def _index_news_list(news_list):
    """Index each news item into full-text search via an FTSIndexer instance."""
    import search
    fts_indexer = search.FTSIndexer()
    docs = [
        dict(
            news_id=entry['news_id'],
            title=entry['title'],
            content=util.extract_text(entry.get('body', '')),
        )
        for entry in news_list
    ]
    fts_indexer.add_many_docs(docs)
def test1():
    """Exercise util.extract_text on text1 with each flag combination."""
    pieces = ['------------\n', 'test1\n']
    for label, extra in (
        ('start, end', ()),
        ('start, end, True, False', (True, False)),
        ('start, end, False, True', (False, True)),
        ('start, end, False, False', (False, False)),
    ):
        pieces.append('------------\n')
        pieces.append(label + '\n')
        pieces.append(util.extract_text(text1, 'start', 'end', *extra))
        pieces.append('\n')
    return ''.join(pieces)
def fetchByUrl(NEWS_URL, language):
    """Fetch an RSS feed and parse its <item> elements into Article objects.

    Items whose URL is already in the DB are skipped.  Returns the list of
    new Article instances tagged with *language*.
    """
    articles = []
    dom = minidom.parseString(util.http_request(NEWS_URL))
    for node in dom.getElementsByTagName('item'):
        # The source is the last '-'-separated part of the title; the rest is
        # the article title.
        arr = node.getElementsByTagName('title')[0].firstChild.data.rsplit('-',1)
        newArticle = common.Article()
        newArticle.title = arr[0].strip()
        logging.debug("Parsing article: '%s'"%newArticle.title)
        newArticle.source = arr[1].strip()
        url = node.getElementsByTagName('link')[0].firstChild.data
        newArticle.url = url
        # Skip articles already stored in the DB.
        if (isURLInDB(url)):
            logging.debug("Article already in the DB, skipping")
            continue
        rawDescription = node.getElementsByTagName('description')[0].firstChild.data
        # Keep only the text before the "..." truncation marker.
        description = util.extract_text(rawDescription).split("...")[0].strip()
        # Drop the leading source name if the feed repeats it in the summary.
        if (description.find(newArticle.source) > -1):
            newArticle.desc = description.split(newArticle.source)[1]
        else:
            newArticle.desc = description
        datestring = node.getElementsByTagName('pubDate')[0].firstChild.data
        if datestring != '':
            newArticle.created = datetime.datetime.strptime(datestring, '%a, %d %b %Y %H:%M:%S GMT+00:00' )
        soup = BeautifulSoup(rawDescription)
        thumbnail = soup.find('img')
        if thumbnail:
            try:
                newArticle.pic_url = thumbnail['src']
            except (KeyError, TypeError):
                # Best-effort: was a bare except that swallowed everything
                # (including KeyboardInterrupt); narrowed to what subscript
                # access on a tag can raise. An <img> with no usable src is
                # simply ignored.
                pass
        newArticle.language = language
        articles.append(newArticle)
    return articles
def fetchByUrlYT(NEWS_URL, language):
    """Fetch a YouTube Atom feed and parse its <entry> elements into Article objects.

    Entries whose URL is already in the DB are skipped.  Returns the list of
    new Article instances tagged with *language*.
    """
    articles = []
    dom = minidom.parseString(util.http_request(NEWS_URL))
    for node in dom.getElementsByTagName('entry'):
        newArticle = common.Article()
        newArticle.title = node.getElementsByTagName('title')[0].firstChild.data
        logging.debug("Parsing article: '[YouTube] %s'"%newArticle.title)
        # All entries from this feed are attributed to YouTube.
        newArticle.source = "YouTube"
        url = node.getElementsByTagName('link')[0].getAttribute('href')
        newArticle.url = url
        # Skip entries already stored in the DB.
        if (isURLInDB(url)):
            logging.debug("Article already in the DB, skipping")
            continue
        rawDescription = node.getElementsByTagName('content')[0].firstChild.data
        # The extracted plain text starts with the title; slice it off so only
        # the description remains.
        description = util.extract_text(rawDescription).strip()[len(newArticle.title):]
        newArticle.desc = description
        datestring = node.getElementsByTagName('updated')[0].firstChild.data
        if datestring != '':
            newArticle.created = datetime.datetime.strptime(datestring, '%Y-%m-%dT%H:%M:%S.000Z' )
        # Pull the first i.ytimg.com thumbnail URL out of the raw HTML, if any.
        thumbnail = re.findall(r'img alt="" src="(http://i.ytimg.com/[^"]+)"',rawDescription)
        if len(thumbnail) > 0:
            newArticle.pic_url = thumbnail[0]
        newArticle.language = language
        articles.append(newArticle)
    return articles
paths.append(path) if os.path.isfile(path): paths.append(path) mode = "JSON" for arg in sys.argv[1:]: if arg == "--markdown": mode = "MD" continue map(ingest, glob.glob(arg)) data = [] for level_path in paths: try: level_text = extract_text(level_path) except UnknownFileHeader: continue if level_text: data.append({ "level" : os.path.split(level_path)[-1], "text" : map(lambda x: x.encode("utf-8"), level_text), }) if mode == "JSON": print json.dumps(data, ensure_ascii=False) elif mode == "MD": markdown = u'' for packet in data: level = packet["level"]
image_height, image_width = image.shape[:2] image = image.reshape((image_height, image_width, 1)) input = np.expand_dims(image, axis=0) y_pred = base_model.predict(input) shape = y_pred[:, :, :].shape ctc_decode = K.ctc_decode(y_pred[:, :, :], input_length=np.ones(shape[0]) * shape[1])[0][0] out = K.get_value(ctc_decode) # out = K.get_value(ctc_decode)[:, :11] print('image_path={}'.format(image_path)) result1 = ''.join([chars[idx - 1] for idx in out[0]]) print('my_result={}'.format(result1)) # cv2.imshow('image', image) # cv2.waitKey(0) image = cv2.imread(image_path) result2 = extract_text(image) print('tesseract_result={}'.format(result2)) height, width = image.shape[:2] image = np.insert(image, [height] * 300, values=255, axis=0) image = np.insert(image, [width] * 200, values=255, axis=1) image_PIL = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB)) # font = ImageFont.truetype('NotoSansCJK-Black.ttc', 20) font = ImageFont.truetype('simsun.ttc', 20) # 字体颜色 fill_color = (255, 0, 0) # 文字输出位置 position1 = (10, 100) position2 = (10, 200) draw = ImageDraw.Draw(image_PIL) draw.text(position1, "okra:{}".format(result1), font=font, fill=fill_color) draw.text(position2,