Example #1
def __init__(
        self, rhythm_type='', step='', octave='', staff='', is_note=True,
        bs_node=None):
    if bs_node:
        # a <rest> child marks a rest; anything else is a pitched note
        self.is_note = not bs_node('rest')
        self.bs_node = bs_node
        if self.is_note:
            self.rhythm_type = extract_text(bs_node, 'type')
            self.step = extract_text(bs_node, 'pitch step')
            self.octave = int(extract_text(bs_node, 'pitch octave'))
            self.is_chord = bool(bs_node('chord'))
        else:
            self.rhythm_type = extract_text(bs_node, 'type')
            self.step = None
            self.octave = None
            self.is_chord = None
        self.staff = int(extract_text(bs_node, 'staff'))
        self.is_hidden = is_hidden(bs_node)
    else:
        self.rhythm_type = rhythm_type
        self.step = step
        self.octave = octave
        self.staff = staff
        self.is_note = is_note
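The `extract_text(bs_node, selector)` helper used above is not part of the snippet. A minimal sketch, assuming it returns the stripped text of the first element matching a BeautifulSoup CSS selector; the selector semantics and the None fallback are inferred from the call sites, not confirmed by the source:

def extract_text(bs_node, selector):
    # return the text of the first element matching the CSS selector,
    # or None when nothing matches (assumed behavior)
    matches = bs_node.select(selector)
    if not matches:
        return None
    return matches[0].get_text(strip=True)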
Example #2
def test5():
    ret = ''
    ret += '------------\n'
    ret += 'start, end, greedy=True\n'
    ret += util.extract_text(text2, r'start', 'end', greedy=True)
    ret += '\n'
    ret += '------------\n'
    ret += 'start, end, greedy=False\n'
    ret += util.extract_text(text2, r'start', 'end', greedy=False)
    ret += '\n'
    return ret
Example #3
def test7():
    ret = ''
    ret += '------------\n'
    ret += 'test7\n'
    ret += '------------\n'
    ret += 'start, end, greedy=True, count=1\n'
    ret += util.extract_text(text4, r'start', 'end', greedy=True, count=1)
    ret += '\n'
    ret += '------------\n'
    ret += 'start, end, greedy=True, count=2\n'
    ret += util.extract_text(text4, r'start', 'end', greedy=True, count=2)
    ret += '\n'
    return ret
Example #4
def test4():
    ret = ''
    ret += '------------\n'
    ret += 'test4\n'
    ret += '------------\n'
    ret += 'start, 123\n'
    ret += util.extract_text(text1, r'start', '123')
    ret += '\n'
    ret += '------------\n'
    ret += '123, start\n'
    ret += util.extract_text(text1, r'123', 'start')
    ret += '\n'
    return ret
Example #5
def _search(keywords):
    import search
    hits = []
    results = search.fts.search(keywords, limit=10)
    db = database.Dao()
    try:
        for hit in results:
            try:
                news = db.get_news(hit['news_id'])
                text = util.extract_text(news[5])
                summary = hit.highlights('content', text=text, top=2)
                hits.append(dict(
                    image_public_url=news[8],
                    share_url=news[3],
                    date=news[4],
                    title=news[2],
                    summary=summary,
                ))
            except Exception as e:
                stack = traceback.format_exc()
                logging.error("one hit error: %s\n%s" % (e, stack))
    finally:
        db.close()

    return hits
Example #6
def _search(keywords):
    import search
    hits = []
    fts_searcher = search.FTSSearcher()
    results = fts_searcher.search(keywords, limit=10)

    db = database.Dao()
    try:
        for hit in results:
            try:
                news = db.get_news(hit['news_id'])
                text = util.extract_text(news[5])
                summary = hit.highlights('content', text=text, top=2)
                hits.append(dict(
                    image_public_url=news[8],
                    share_url=news[3],
                    date=news[4],
                    title=news[2],
                    summary=summary,
                ))
            except Exception as e:
                stack = traceback.format_exc()
                logging.error("one hit error: %s\n%s" % (e, stack))
    finally:
        db.close()
        fts_searcher.close()

    return hits
Example #7
def _index_news_list(news_list):
    from search import fts
    news_docs = []
    for news in news_list:
        body_text = util.extract_text(news.get('body', ''))
        news_docs.append(
            dict(news_id=news['id'], title=news['title'], content=body_text))
    fts.add_many_docs(news_docs)
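In the search and indexing snippets (Examples #5 through #7), `util.extract_text` takes a raw HTML news body and returns plain text for full-text indexing and highlighting. A minimal sketch of that variant, assuming BeautifulSoup is available; the real helper may differ:

from bs4 import BeautifulSoup

def extract_text(html):
    # strip markup and collapse the body to space-separated plain text
    return BeautifulSoup(html, 'html.parser').get_text(separator=' ', strip=True)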
Example #8
def test8():
    ret = ''
    ret += '------------\n'
    ret += 'test8\n'
    ret += '------------\n'
    ret += 'start=None\n'
    ret += util.extract_text(text1, start=None, end='end')
    ret += '\n'
    ret += '------------\n'
    ret += 'end=None\n'
    ret += util.extract_text(text1, start='start', end=None)
    ret += '\n'
    ret += '------------\n'
    ret += 'start=None, end=None\n'
    ret += util.extract_text(text1, start=None, end=None)
    ret += '\n'
    return ret
Example #9
def test2():
    ret = ''
    ret += '------------\n'
    ret += 'test2\n'
    ret += '------------\n'
    ret += 'nostart, end\n'
    ret += util.extract_text(text1, 'nostart', 'end')
    ret += '\n'
    ret += '------------\n'
    ret += 'start, noend\n'
    ret += util.extract_text(text1, 'start', 'noend')
    ret += '\n'
    ret += '------------\n'
    ret += 'nostart, noend\n'
    ret += util.extract_text(text1, 'nostart', 'noend')
    ret += '\n'
    return ret
Example #10
def word_list(lst_dic, type=None):
    if not type:
        return " ".join(lst_dic)
    if type == "text":
        text = util.extract_text(lst_dic)
    elif type == "hashtag":
        text = util.extract_hash(lst_dic)
    else:
        # unknown type: fall back to the raw list instead of raising NameError
        text = lst_dic
    return " ".join(text)
Example #11
def _index_news_list(news_list):
    import search

    fts_indexer = search.FTSIndexer()
    news_docs = []
    for news in news_list:
        body_text = util.extract_text(news.get("body", ""))
        news_docs.append(dict(news_id=news["news_id"], title=news["title"], content=body_text))
    fts_indexer.add_many_docs(news_docs)
Example #12
def test3():
    ret = ''
    ret += '------------\n'
    ret += 'test3\n'
    ret += '------------\n'
    ret += '\\d, \\d\\s\\d\n'
    ret += util.extract_text(text1, r'\d', r'\d\s\d')
    ret += '\n'
    return ret
Example #13
def _index_news_list(news_list):
    import search
    fts_indexer = search.FTSIndexer()
    news_docs = []
    for news in news_list:
        body_text = util.extract_text(news.get('body', ''))
        news_docs.append(
            dict(news_id=news['news_id'],
                 title=news['title'],
                 content=body_text))
    fts_indexer.add_many_docs(news_docs)
Example #14
def test1():
    ret = ''
    ret += '------------\n'
    ret += 'test1\n'
    ret += '------------\n'
    ret += 'start, end\n'
    ret += util.extract_text(text1, 'start', 'end')
    ret += '\n'
    ret += '------------\n'
    ret += 'start, end, True, False\n'
    ret += util.extract_text(text1, 'start', 'end', True, False)
    ret += '\n'
    ret += '------------\n'
    ret += 'start, end, False, True\n'
    ret += util.extract_text(text1, 'start', 'end', False, True)
    ret += '\n'
    ret += '------------\n'
    ret += 'start, end, False, False\n'
    ret += util.extract_text(text1, 'start', 'end', False, False)
    ret += '\n'
    return ret
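The test functions above (Examples #2 through #4, #8, #9, #12, and #14) all exercise the same regex-based helper. Below is a minimal sketch consistent with those calls; the flag names `include_start`/`include_end` (matching test1's positional booleans) and all default values are guesses, not the real API:

import re

def extract_text(text, start=None, end=None, include_start=True,
                 include_end=True, greedy=False, count=1):
    # start/end are regex patterns delimiting the span to extract;
    # None means the beginning/end of the text (assumed semantics)
    start_pat = start if start is not None else r''
    end_pat = end if end is not None else r'$'
    middle = r'.*' if greedy else r'.*?'
    pattern = r'(%s)(%s)(%s)' % (start_pat, middle, end_pat)
    pieces = []
    for i, match in enumerate(re.finditer(pattern, text, re.S)):
        if i >= count:
            break
        chunk = match.group(2)
        if include_start:
            chunk = match.group(1) + chunk
        if include_end:
            chunk += match.group(3)
        pieces.append(chunk)
    return ''.join(pieces)

Under this sketch, the True/False pairs passed positionally in test1 control whether the matched start and end delimiters are kept in the returned span, and count caps how many delimited spans are concatenated.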
Example #15
def fetchByUrl(NEWS_URL, language):
    articles = []
    dom = minidom.parseString(util.http_request(NEWS_URL))
    for node in dom.getElementsByTagName('item'):
        # the title is "<headline> - <source>"; the source is the last part
        arr = node.getElementsByTagName('title')[0].firstChild.data.rsplit('-', 1)
        newArticle = common.Article()
        newArticle.title = arr[0].strip()
        logging.debug("Parsing article: '%s'"%newArticle.title)
        #replace the ' mark to its html encoding and save in source
        newArticle.source = arr[1].strip()
        url = node.getElementsByTagName('link')[0].firstChild.data
        newArticle.url = url 

        # skip articles that are already in the DB
        if isURLInDB(url):
            logging.debug("Article already in the DB, skipping")
            continue

        # extract the plain-text description
        rawDescription = node.getElementsByTagName('description')[0].firstChild.data
        # newArticle.raw = rawDescription  # used for debugging
        # keep only the text before the trailing "..."
        description = util.extract_text(rawDescription).split("...")[0].strip()
        if (description.find(newArticle.source) > -1):
            newArticle.desc = description.split(newArticle.source)[1]
        else:
            newArticle.desc = description
        datestring = node.getElementsByTagName('pubDate')[0].firstChild.data
        if datestring != '':
            newArticle.created = datetime.datetime.strptime(datestring, '%a, %d %b %Y %H:%M:%S GMT+00:00' )
        
        soup = BeautifulSoup(rawDescription)
        thumbnail = soup.find('img')
        if thumbnail:
            try:
                newArticle.pic_url = thumbnail['src']
            except KeyError:
                # some items embed an <img> tag without a src attribute
                pass
        #@@ari
        newArticle.language = language
        articles.append(newArticle)
    
    return articles
Example #16
def fetchByUrlYT(NEWS_URL, language):
    articles = []
    dom = minidom.parseString(util.http_request(NEWS_URL))
    for node in dom.getElementsByTagName('entry'):
        # the source is the last part and the rest is the title
        newArticle = common.Article()
        newArticle.title = node.getElementsByTagName('title')[0].firstChild.data
        logging.debug("Parsing article: '[YouTube] %s'"%newArticle.title)
        #replace the ' mark to its html encoding and save in source
        newArticle.source = "YouTube"
        url = node.getElementsByTagName('link')[0].getAttribute('href')
        newArticle.url = url

        # skip articles that are already in the DB
        if isURLInDB(url):
            logging.debug("Article already in the DB, skipping")
            continue

        # extract the plain-text description; the content begins with the
        # title, so slice the title off the front
        rawDescription = node.getElementsByTagName('content')[0].firstChild.data
        # newArticle.raw = rawDescription  # used for debugging
        description = util.extract_text(rawDescription).strip()[len(newArticle.title):]
        newArticle.desc = description
        datestring = node.getElementsByTagName('updated')[0].firstChild.data
        if datestring != '':
            newArticle.created = datetime.datetime.strptime(datestring, '%Y-%m-%dT%H:%M:%S.000Z' )

        thumbnail = re.findall(r'img alt="" src="(http://i.ytimg.com/[^"]+)"', rawDescription)
        if thumbnail:
            newArticle.pic_url = thumbnail[0]

        #@@ari
        newArticle.language = language
        articles.append(newArticle)

    return articles
Example #17
                    paths.append(path)
            
        if os.path.isfile(path):
            paths.append(path)

    mode = "JSON"
    for arg in sys.argv[1:]:
        if arg == "--markdown":
            mode = "MD"
            continue
        map(ingest, glob.glob(arg))

    data = []
    for level_path in paths:
        try:
            level_text = extract_text(level_path)
        except UnknownFileHeader:
            continue
        if level_text:
            data.append({
                "level" : os.path.split(level_path)[-1],
                "text" : map(lambda x: x.encode("utf-8"), level_text),
            })

    if mode == "JSON":
        print json.dumps(data, ensure_ascii=False)

    elif mode == "MD":
        markdown = u''
        for packet in data:
            level = packet["level"]
Example #18
image_height, image_width = image.shape[:2]
image = image.reshape((image_height, image_width, 1))
input = np.expand_dims(image, axis=0)
y_pred = base_model.predict(input)
shape = y_pred[:, :, :].shape
ctc_decode = K.ctc_decode(y_pred[:, :, :],
                          input_length=np.ones(shape[0]) * shape[1])[0][0]
out = K.get_value(ctc_decode)
# out = K.get_value(ctc_decode)[:, :11]
print('image_path={}'.format(image_path))
result1 = ''.join([chars[idx - 1] for idx in out[0]])
print('my_result={}'.format(result1))
# cv2.imshow('image', image)
# cv2.waitKey(0)
image = cv2.imread(image_path)
result2 = extract_text(image)
print('tesseract_result={}'.format(result2))
height, width = image.shape[:2]
# pad the image with white rows/columns so the overlay text fits
image = np.insert(image, [height] * 300, values=255, axis=0)
image = np.insert(image, [width] * 200, values=255, axis=1)
image_PIL = Image.fromarray(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
# font = ImageFont.truetype('NotoSansCJK-Black.ttc', 20)
font = ImageFont.truetype('simsun.ttc', 20)
# font color
fill_color = (255, 0, 0)
# text output positions
position1 = (10, 100)
position2 = (10, 200)
draw = ImageDraw.Draw(image_PIL)
draw.text(position1, "okra:{}".format(result1), font=font, fill=fill_color)
draw.text(position2, "tesseract:{}".format(result2), font=font, fill=fill_color)
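The `extract_text(image)` call above produces the value the script prints as tesseract_result, so it presumably wraps Tesseract OCR. A minimal sketch of such a wrapper using pytesseract; the grayscale preprocessing is an assumption, not the snippet's actual code:

import cv2
import pytesseract
from PIL import Image

def extract_text(image):
    # OCR the BGR frame with Tesseract; grayscale input usually helps accuracy
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    return pytesseract.image_to_string(Image.fromarray(gray)).strip()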