def handle_conversation_replied(ctx):
    user_id = ctx['data']['item']['user']['user_id']
    user_id_parts = user_id.split('/')

    message = ctx['data']['item']['conversation_parts']['conversation_parts'][0]
    image_urls = [
        m.group(1)
        for m in re.finditer(r'<\s*img\s+src="(http.+?)"\s*>', message['body'])
    ]

    if user_id_parts[0] == 'wechat':
        wechat_client, wechat_id = user_id_parts[1:]
        wechat_client_encoded = urlencode(wechat_client)
        for url in image_urls:
            wechat.send_friend_message(client=wechat_client_encoded,
                                       id=wechat_id,
                                       media_path=url)
        wechat.send_friend_message(client=wechat_client_encoded,
                                   id=wechat_id,
                                   content=remove_tags(
                                       message['body']).strip())
    elif 'INTERCOM_BOT_USER_ID' in app.config \
            and user_id == app.config['INTERCOM_BOT_USER_ID']:
        handle_admin_commands(app.config['INTERCOM_BOT_USER_ID'],
                              remove_tags(message['body']).strip())
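A note on remove_tags: every snippet on this page calls a project-local remove_tags helper that strips HTML/XML markup from a string. A minimal regex-based sketch of such a helper could look like the following (an assumption for reference only; each project ships its own implementation, which may also unescape entities or use a real parser):

import re

def remove_tags(text):
    """Strips anything that looks like an HTML/XML tag (minimal sketch)."""
    return re.sub(r'<[^>]+>', '', text)

print(remove_tags('<p>Hello <b>world</b></p>'))  # -> Hello world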
Example #2
File: views.py Project: cmbi/whynot
def entries_file():
    # TODO: speed up this method
    collection = request.args.get('collection')
    databank_name = request.args.get('databank')
    comment_text = request.args.get('comment')

    # listing determines what is shown per entry (pdb ids, databank names,
    # comments, file names, etc.)
    listing = request.args.get('listing')

    _log.info("request for entries file %s %s %s %s" % (
        collection, databank_name, comment_text, listing))

    if not listing:
        return ''

    listing = listing.lower()

    entries = []
    name = "0"
    if databank_name and collection:
        entries = get_entries_from_collection(databank_name, collection)
        name = "%s%s" % (databank_name, collection)
    elif databank_name and comment_text:
        entries = get_entries_with_comment(databank_name, comment_text)
        name = "%s%s" % (databank_name, remove_tags(comment_text))
    elif comment_text:
        entries = get_all_entries_with_comment(comment_text)
        name = remove_tags(comment_text)

    text = ''
    if listing == 'comments':
        d = {}
        for entry in entries:
            if 'comment' in entry:
                c = entry['comment']
                if c not in d:
                    d[c] = ''
                d[c] += '%s,%s\n' % (entry['databank_name'], entry['pdbid'])
        for comment in d:
            text += comment + ":\n" + d[comment]
    else:
        for entry in entries:
            if listing == 'pdbids':
                text += entry['pdbid'] + '\n'
            elif listing == 'entries':
                text += '%s,%s\n' % (entry['databank_name'], entry['pdbid'])
            elif listing == 'files' and 'filepath' in entry:
                text += '%s,%s,%s\n' % (entry['databank_name'], entry['pdbid'],
                                        entry['filepath'])

    response = Response(text, mimetype='text/plain')
    response.headers["Content-Disposition"] = \
        "attachment; filename=%s_%s" % (name, listing)

    return response
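For context, this Flask view could be exercised with a plain HTTP request; a hedged sketch (the host, route, and parameter values below are placeholders, not taken from the project):

import requests

resp = requests.get(
    "http://localhost:5000/entries_file",   # hypothetical host and route
    params={"databank": "dssp", "comment": "obsolete", "listing": "entries"},
)
print(resp.headers["Content-Disposition"])  # attachment; filename=..._entries
print(resp.text)                            # one "databank,pdbid" pair per line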
Example #3
 def test_instances(self):
     """
     Returns the test instances from the SemEval2007 coarse-grained WSD task.
     
     >>> coarse_wsd = SemEval2007_Coarse_WSD()
     >>> inst2ans = coarse_wsd.get_answers()
     >>> for inst in inst2ans:
     ...    print inst, inst2ans[inst]
     ...    break
     d004.s073.t013 answer(sensekey=[u'pointer%1:06:01::', u'pointer%1:06:00::', u'pointer%1:10:00::'], lemma=u'pointer', pos=u'n')
     """
     Instance = namedtuple('instance', 'id, lemma, word')
     test_file = io.open(self.test_file, 'r').read()
     inst2ans = self.get_answers()
     
     for text in bsoup(test_file).findAll('text'):
         textid = text['id']
         document = " ".join([remove_tags(i) for i in str(text).split('\n') 
                              if remove_tags(i)])
         for sent in text.findAll('sentence'):
             sentence =  " ".join([remove_tags(i) for i in 
                                   str(sent).split('\n') if remove_tags(i)])
             for instance in sent.findAll('instance'):
                 instid = instance['id']
                 lemma = instance['lemma']
                 word = instance.text
                 inst = Instance(instid, lemma, word)
                 yield inst, inst2ans[instid], unicode(sentence), unicode(document)
Example #4
File: odin.py Project: susfert/sugali
def pickle2plaintext(testing=False, option="cleanest"):
    """ Converted ODIN IGTs from the .pk file into tab-delimited plaintexts."""
    # Makes a temp output directory for the individual files.
    TEMPODIN_DIR = "./tmpodin/"  # for saving the temp ODIN files.
    if not os.path.exists(TEMPODIN_DIR):
        os.makedirs(TEMPODIN_DIR)

    for language, documents in sorted(load_odin_pickle()):
        tab_igts = []
        for d in documents:
            if d[0].strip() == "":
                continue
            src = remove_tags(d[0])
            # Removes heading bullets, e.g. (1)... | 1) | ( 12 ) | i. ... | A2. ...
            src = re.sub(r"^\(?\s?\w{1,5}\s*[):.]\s*", "", src)
            src = re.sub(r"^\(?\w{1,5}\s*[):.]\s*", "", src)
            src = re.sub(r"^\(?\w{1,5}\s*[):.]\s*", "", src)
            morphemes = src
            # Joins the morphemes up into words.
            words = re.sub(" *- *", "", src)

            if option == "cleanest":  # Accepts only IGTs without punctuation.
                if src == "" or any(i for i in string.punctuation if i in src):
                    continue
            elif option == "cleaner":  # Removes the example number at the end.
                patterns = [r"\(.{1,}\)", r"[\(\)]"]
                for pat in patterns:
                    src = re.sub(pat, "", src)
            else:  # Accepts IGTs as they are.
                if src == "":
                    continue

            # src, eng, gloss, cite = d[0], d[1], d[2], d[3]
            tab_igts.append([words, morphemes, remove_tags(d[1]), remove_tags(d[2]), d[3]])
        if len(tab_igts) > 0:
            with codecs.open(TEMPODIN_DIR + "odin-" + language + ".txt", "w", "utf8") as fout:
                for igt in tab_igts:
                    print >> fout, "\t".join(igt)

        if testing:
            break

    if testing:
        # Compresses the utf8 ODIN files into a single tarfile in the test dir.
        try:
            make_tarfile("test/odin-" + option + ".tar", TEMPODIN_DIR)
        except IOError:
            # If called from within the sugarlike/src/universalcorpus dir,
            # move up a directory to reach sugarlike/data/ and sugarlike/test/.
            make_tarfile("../test/odin-" + option + ".tar", TEMPODIN_DIR)
    else:
        # Compresses the utf8 ODIN files into a single tarfile.
        try:
            make_tarfile("../data/odin/odin-" + option + ".tar", TEMPODIN_DIR)
        except IOError:
            # If called from within the sugarlike/src/universalcorpus dir,
            # move up a directory to reach sugarlike/data/ and sugarlike/test/.
            make_tarfile("../data/odin/odin-" + option + ".tar", TEMPODIN_DIR)
    # Removes the temporary ODIN directory.
    shutil.rmtree(TEMPODIN_DIR)
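To see what the heading-bullet regex above actually strips, here is a small standalone check (the sample lines are invented):

import re

BULLET = r"^\(?\s?\w{1,5}\s*[):.]\s*"
for line in ["(1) taro ga hon o yonda", "12) ita", "( 12 ) foo", "A2. bar"]:
    print(re.sub(BULLET, "", line))
# -> taro ga hon o yonda / ita / foo / bar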
Example #5
 def __init__(self, swots, testId, questionId, answer, timeSpent):
     feedback = swots.postJson("/quiz/users/" + str(swots.userId)
             + "/tests/" + str(testId)
             + "/feedback/" + str(questionId), {"answer": answer, "timeSpent": timeSpent})
     self.answer = feedback["answer"]
     self.solution = remove_tags(feedback["solution"]["solutionText"]).strip()
     self.answerText = remove_tags(feedback["options"][self.answer - 1])
     self.status = feedback["userStats"]["status"]
Example #6
File: routes.py Project: jonblack/whynot
def entries_file():
    # TODO: speed up this method
    collection = request.args.get('collection')
    databank_name = request.args.get('databank')
    comment_text = request.args.get('comment')

    # listing determines what is shown per entry (pdb ids, databank names,
    # comments, file names, etc.)
    listing = request.args.get('listing')

    _log.info("request for entries file %s %s %s %s" % (
        collection, databank_name, comment_text, listing))

    if not listing:
        return ''

    listing = listing.lower()

    entries = []
    name = "0"
    if databank_name and collection:
        entries = get_entries_from_collection(databank_name, collection)
        name = "%s%s" % (databank_name, collection)
    elif databank_name and comment_text:
        entries = get_entries_with_comment(databank_name, comment_text)
        name = "%s%s" % (databank_name, remove_tags(comment_text))
    elif comment_text:
        entries = get_all_entries_with_comment(comment_text)
        name = remove_tags(comment_text)

    text = ''
    if listing == 'comments':
        d = {}
        for entry in entries:
            if 'comment' in entry:
                c = entry['comment']
                if c not in d:
                    d[c] = ''
                d[c] += '%s,%s\n' % (entry['databank_name'], entry['pdb_id'])
        for comment in d:
            text += comment + ":\n" + d[comment]
    else:
        for entry in entries:

            if listing == 'pdb_ids':
                text += entry['pdb_id'] + '\n'
            elif listing == 'entries':
                text += '%s,%s\n' % (entry['databank_name'], entry['pdb_id'])
            elif listing == 'files' and 'filepath' in entry:
                text += '%s,%s,%s\n' % (entry['databank_name'],
                                        entry['pdb_id'],
                                        entry['filepath'])

    response = Response(text, mimetype='text/plain')
    header_val = "attachment; filename=%s_%s" % (name, listing)
    response.headers["Content-Disposition"] = header_val

    return response
Example #7
    def parseListToGamesInBlock(self, mix):
        teams_ls = []
        time = ""
        chan = ""
        listOfGames = []

        for item in mix:
            string_comp = str(item)
            string_comp = string_comp.replace(u'\xa0', ' ')
            validGame = 0

            if "Week " in string_comp:
                print("Week")
            elif " vs. " in string_comp:
                teams_ls = re.split(r' (vs)\. ', string_comp)
                #print("VS: " + string_comp)
                #teams_ls[2] = teams_ls[2].split('(')[0]
                #print("t1: " + teams_ls[0] + " t2: " + teams_ls[2])
                validGame = 0
            elif (" at " in string_comp) and (not "Updated" in string_comp):
                tagless_team_line = utils.remove_tags(
                    str(item.contents[self.CONST_GAME_INDEX]))

                teams_ls = re.split(' (at) ', tagless_team_line)
                #print("AT: " )
                #print("AT CONTENTS: " + str(item.contents[1]))
                #print("t0 " + teams_ls[0] + " t1: " + teams_ls[1]+ " t2: " + teams_ls[2])

                # Time
                tagless_time_line = utils.remove_tags(
                    str(item.contents[self.CONST_TIME_INDEX]))
                time = tagless_time_line
                #print("Time: " + time)

                # Channel
                tagless_chan_line = utils.remove_tags(
                    str(item.contents[self.CONST_CHANNEL_INDEX]))
                chan = tagless_chan_line
                #print("Chan: " + chan)

                validGame = 1
            else:
                print("Error?")
                #print("Possible error parsing game: " + string_comp)

            if validGame:
                temp_game_info = GameInfo(teams_ls[0], teams_ls[2], time, chan)
                print("Adding GI: " + teams_ls[0] + " " + teams_ls[2] + " " +
                      time + " " + chan)
                print("Added: " + temp_game_info.getTeam1() + " " +
                      temp_game_info.getTeam2() + " " +
                      temp_game_info.getTimeStr() + " " +
                      temp_game_info.getChannel())
                listOfGames.append(
                    GameInfo(teams_ls[0], teams_ls[2], time, chan))

        return listOfGames
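The capturing groups in the re.split calls above are what make the team names land at indices 0 and 2: with a group in the pattern, re.split keeps the separator itself at index 1. A quick illustration with a made-up line:

import re

parts = re.split(r' (at) ', 'Packers at Bears 7:20 PM')
print(parts)  # ['Packers', 'at', 'Bears 7:20 PM']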
Example #8
 def yield_sentences(self):
     test_file = io.open(self.test_file, 'r').read()
     inst2ans = self.get_answers()        
     for text in bsoup(test_file).findAll('text'):
         if not text:
             continue
         textid = text['id']
         context_doc = " ".join([remove_tags(i) for i in 
                                 str(text).split('\n') if remove_tags(i)])
         for sent in text.findAll('sentence'):
             context_sent =  " ".join([remove_tags(i) for i in 
                                   str(sent).split('\n') if remove_tags(i)])
             yield sent, context_sent, context_doc, inst2ans, textid
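A hedged sketch of consuming this generator, assuming it lives on the SemEval2007_Coarse_WSD class used elsewhere on this page:

coarse_wsd = SemEval2007_Coarse_WSD()
for sent, context_sent, context_doc, inst2ans, textid in coarse_wsd.yield_sentences():
    print textid, context_sent[:60]  # Python 2 print, matching the surrounding code
    break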
Example #10
    def _single_convert(self, input_file_object):
        if input_file_object:
            input_file_path = input_file_object.get_input_file_path()
            output_file_name = rename_filename_with_extension(
                os.path.basename(input_file_path), self.final_format)

            intermediate_filename = str(time.time()).replace('.', '') + '.html'
            output_file_path = os.path.join(self.tmp_dir, output_file_name)
            intermediate_path = os.path.join(
                self.tmp_dir, intermediate_filename)

            with codecs.open(input_file_path, "r", "utf-8") as f:
                cleaned_content = remove_tags(f.read())
                with open(intermediate_path, 'w') as w:
                    w.write(cleaned_content)

            converter = CONVERTER_LOCATION.format(
                input_file_path=intermediate_path,
                output_file_path=output_file_path)

            self.execute(converter)
            if os.path.isfile(output_file_path):
                return output_file_path
            else:
                self.handle_failed_conversion(input_file_object)

        log.error('Conversion failed from HTML => PDF')
        return None
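CONVERTER_LOCATION is evidently a shell-command template with two named slots; a hedged example of what it might hold (wkhtmltopdf is an assumption here, not confirmed by the source):

# Hypothetical value; the real template lives in the project's settings.
CONVERTER_LOCATION = "wkhtmltopdf {input_file_path} {output_file_path}"

print(CONVERTER_LOCATION.format(
    input_file_path="/tmp/1700000000.html",
    output_file_path="/tmp/report.pdf"))
# -> wkhtmltopdf /tmp/1700000000.html /tmp/report.pdf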
Example #11
def create():
    contacts = json.loads(request.form.get('contacts'))
    for contact in contacts:
        db_contact = Contact.query.filter_by(id=contact['id']).first()
        if db_contact:
            db_contact.name = remove_tags(contact['name'])
            db_contact.email = remove_tags(contact['email'])
            db_contact.cell_number = format_phone_number(contact['cell_number'])
            db_contact.carrier_id = contact['carrier_id']
            db_contact.notify = contact['notify']
        else:
            new_contact = Contact(name=remove_tags(contact['name']), email=remove_tags(contact['email']),
                                  cell_number=format_phone_number(contact['cell_number']),
                                  carrier_id=contact['carrier_id'], notify=contact['notify'])
            db.session.add(new_contact)
        db.session.commit()
    return redirect(url_for('contact.contacts'))
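The view expects a form field named contacts holding a JSON array; a hedged sketch of a matching payload (the route and all field values are invented):

import json

payload = {"contacts": json.dumps([
    {"id": 1, "name": "Ada <b>L.</b>", "email": "ada@example.com",
     "cell_number": "5550100", "carrier_id": 2, "notify": True},
])}
# e.g. with a Flask test client: client.post('/contacts/create', data=payload)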
Example #12
    def addWKTLayer(self, WKT, legendName, epsg, fields, originalcrs=True):
        """
        Build a Vector layer from WKT string and adds it to the map canvas
        """

        # strip html tags
        legendName = remove_tags(legendName)

        # build the geometry
        geom = QgsGeometry.fromWkt(WKT)
        if not isinstance(geom, QgsGeometry):
            self.setErrorMessages('addWKTLayer', 'Check the WKT string; it is probably not correct!')
            return False

        # transform the geometry if originalcrs is set to False
        if not originalcrs:
            currentCrs = self.mapCanvas.mapSettings().destinationCrs()
            trasform = QgsCoordinateTransform(QgsCoordinateReferenceSystem(int(epsg[5:])), currentCrs)
            geom.transform(trasform)
            epsg = str(currentCrs.authid())

        # check the type
        geomType = geom.type()
        if geomType in (QGis.UnknownGeometry, QGis.NoGeometry):
            self.setErrorMessages('addWKTLayer', 'QGIS reports that your WKT is not a valid geometry or is an unknown geometry')
            return False
        
        # build the memory-layer URI: map the geometry type and derive the
        # field definitions from the JSON-encoded fields parameter
        fieldsAttr = json.loads(fields)
        
        fieldsAttrList = []
        for k, v in fieldsAttr.items():
            if type(v) == str or type(v) == unicode:
                fieldsAttrList.append(k + ':string')
            elif type(v) == float:
                fieldsAttrList.append(k + ':double')
            elif type(v) == int:
                fieldsAttrList.append(k + ':integer')
                
        uriVLayer = TYPE_MAP[geom.wkbType()] + '?crs=' + epsg + '&field=' + '&field='.join(fieldsAttrList)
        logQgisConsole(uriVLayer)
        VLayer = QgsVectorLayer(uriVLayer, legendName, "memory")
        pr = VLayer.dataProvider()
        seg = QgsFeature()
        layerFields = pr.fields()
        seg.setFields(layerFields)
        for k, v in fieldsAttr.items():
            seg.setAttribute(k, v)
        seg.setGeometry(geom)
        pr.addFeatures([seg])
        VLayer.updateExtents() 

        QgsMapLayerRegistry.instance().addMapLayers([VLayer])
        return VLayer.id()
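A hedged call matching the signature above, as it might appear inside the owning class (all values are invented):

layer_id = self.addWKTLayer(
    WKT="POINT(12.49 41.89)",
    legendName="<b>Sample point</b>",  # HTML is stripped by remove_tags
    epsg="EPSG:4326",                  # sliced as int(epsg[5:]) above
    fields=json.dumps({"name": "Rome", "elev": 21.0, "rank": 1}),
    originalcrs=True,
)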
Example #13
def check_lang(langcode, option):
    """ Queries SIL website for language status given the language code. """
    import urllib2, re
    from bs4 import BeautifulSoup as bs
    from utils import remove_tags
    url = "http://www-01.sil.org/iso639-3/documentation.asp?id=" + langcode
    page = urllib2.urlopen(url).read().decode('utf8')
    if ("The supplied value for the code parameter is not "
            "a valid three-letter identifier.") in page:
        return None
    status = unicode([i for i in bs(page).find_all('tr')
                      if option + ":" in i.text][0])
    status = re.findall(r'<td>.*?</td>', status)[0]
    return remove_tags(status).lower()
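A hedged usage example; the option argument must match a row label on the SIL documentation page, and "Status" is an assumption:

print check_lang('eng', 'Status')  # e.g. 'active'; returns None for an unknown code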
Example #14
def build_search_db(entries):
    search = []
    for index, entry in enumerate(entries):
        item = {
            "id": index,
            "title": entry['title'],
            "date": entry['date'].strftime(PUBLISHED_DATE_FORMAT),
            "content": remove_tags(entry['html']),
            "url": entry['url']
        }
        search.append(item)
    search_json = json.dumps(search)
    path = join(INDEX_DIR, "search.json")
    write(path, search_json)
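The resulting search.json is a flat JSON array; a hedged sketch of one record (field values invented, and PUBLISHED_DATE_FORMAT is project-defined):

[{"id": 0,
  "title": "Hello world",
  "date": "2015-01-01",
  "content": "Plain text of the post...",
  "url": "/posts/hello-world/"}]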
Example #15
def get_odin_igts(ODINFILE=parentddir + '/data/odin/odin-full.tar'):
    """
  Extracts the examples from the ODIN igts and returns a defaultdict(list),
  where the keys are the lang iso codes and values are the examples.
  
  >>> igts = get_odin_igts()
  >>> for lang in igts:
  >>>  for igt in igts[lang]:
  >>>    print lang, igt
  """

    tar = tarfile.open(ODINFILE)
    docs = defaultdict(list)
    for infile in tar:
        if '.xml' in infile.name:  # there's a rogue file in the tar that is not xml.
            lang = infile.name[:-4].lower()
            ##print lang
            # Find the <igt>...</igt> in the xml.
            odinfile = tar.extractfile(infile).read()
            igts = bs(odinfile).findAll('igt')
            citations = bs(odinfile).findAll('citation')
            for igt, cite in zip(igts, citations):
                # Find the <example>...</example> in the igt.
                examples = bs(unicode(igt)).findAll('example')
                cite = remove_tags(unicode(cite)).strip(' &lt;/p&gt;')
                for eg in examples:
                    try:
                        # Only use three-line examples, assuming
                        # line1: src, line2: eng, line3: gloss
                        src, eng, gloss = bs(unicode(eg)).findAll('line')
                        src, eng, gloss, cite = map(unicode,
                                                    [src, eng, gloss, cite])
                        docs[lang].append((src, eng, gloss, cite))
                        ##print src, eng, gloss, cite
                    except:
                        print eg
                        raise
    return docs
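Note that str.strip takes a set of characters, not a substring: the cite.strip(' &lt;/p&gt;') call above removes any leading or trailing run of the characters ' ', '&', 'l', 't', ';', '/', 'p', 'g'. A quick illustration:

print '&lt;/p&gt; Smith 1990 &lt;/p&gt;'.strip(' &lt;/p&gt;')
# -> Smith 1990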
Example #16
File: odin.py Project: susfert/sugali
def get_odin_igts(ODINFILE=parentddir + "/data/odin/odin-full.tar"):
    """
  Extracts the examples from the ODIN igts and returns a defaultdict(list),
  where the keys are the lang iso codes and values are the examples.
  
  >>> igts = get_odin_igts()
  >>> for lang in igts:
  >>>  for igt in igts[lang]:
  >>>    print lang, igt
  """

    tar = tarfile.open(ODINFILE)
    docs = defaultdict(list)
    for infile in tar:
        if ".xml" in infile.name:  # there's a rogue file in the tar that is not xml.
            lang = infile.name[:-4].lower()
            print lang
            # Find the <igt>...</igt> in the xml.
            odinfile = tar.extractfile(infile).read()
            igts = bs(odinfile).findAll("igt")
            citations = bs(odinfile).findAll("citation")
            for igt, cite in zip(igts, citations):
                # Find the <example>...</example> in the igt.
                examples = bs(unicode(igt)).findAll("example")
                cite = remove_tags(unicode(cite)).strip(" &lt;/p&gt;")
                for eg in examples:
                    try:
                        # Only use three-line examples, assuming
                        # line1: src, line2: eng, line3: gloss
                        src, eng, gloss = bs(unicode(eg)).findAll("line")
                        src, eng, gloss, cite = map(unicode, [src, eng, gloss, cite])
                        docs[lang].append((src, eng, gloss, cite))
                        ##print src, eng, gloss, cite
                    except:
                        print eg
                        raise
    return docs
Example #17
 def question(self):
     return remove_tags(self._qDict["questionText"])
Example #18
 def options(self):
     return [remove_tags(qo) for qo in self._qDict["options"]]
Example #19
 def solution(self):
     self._getFeedback()
     return remove_tags(self._feedback["solution"]["solutionText"])
Example #20
def pickle2plaintext(testing=False, option='cleanest'):
    """ Converted ODIN IGTs from the .pk file into tab-delimited plaintexts."""
    # Makes a temp output directory for the individual files.
    TEMPODIN_DIR = './tmpodin/'  # for saving the temp ODIN files.
    if not os.path.exists(TEMPODIN_DIR):
        os.makedirs(TEMPODIN_DIR)

    for language, documents in sorted(load_odin_pickle()):
        tab_igts = []
        for d in documents:
            if d[0].strip() == "":
                continue
            src = remove_tags(d[0])
            # Removes heading bullets, e.g. (1)... | 1) | ( 12 ) | i. ... | A2. ...
            src = re.sub(r'^\(?\s?\w{1,5}\s*[):.]\s*', '', src)
            src = re.sub(r'^\(?\w{1,5}\s*[):.]\s*', '', src)
            src = re.sub(r'^\(?\w{1,5}\s*[):.]\s*', '', src)
            morphemes = src
            # Joins the morphemes up into words.
            words = re.sub(' *- *', '', src)

            if option == 'cleanest':  # Accepts only IGTs without punctuation.
                if src == '' or any(i for i in string.punctuation if i in src):
                    continue
            elif option == 'cleaner':  # Removes the example number at the end.
                patterns = [r"\(.{1,}\)", r"[\(\)]"]
                for pat in patterns:
                    src = re.sub(pat, '', src)
            else:  # Accepts IGTs as they are.
                if src == '':
                    continue

            # src, eng, gloss, cite = d[0], d[1], d[2], d[3]
            tab_igts.append([words, morphemes, remove_tags(d[1]),
                             remove_tags(d[2]), d[3]])
        if len(tab_igts) > 0:
            with codecs.open(TEMPODIN_DIR + 'odin-' + language + '.txt', 'w',
                             'utf8') as fout:
                for igt in tab_igts:
                    print >> fout, "\t".join(igt)

        if testing:
            break

    if testing:
        # Compresses the utf8 ODIN files into a single tarfile in the test dir.
        try:
            make_tarfile('test/odin-' + option + '.tar', TEMPODIN_DIR)
        except IOError:
            # If called from within the sugarlike/src/universalcorpus dir,
            # move up a directory to reach sugarlike/data/ and sugarlike/test/.
            make_tarfile('../test/odin-' + option + '.tar', TEMPODIN_DIR)
    else:
        # Compresses the utf8 ODIN files into a single tarfile.
        try:
            make_tarfile('../data/odin/odin-' + option + '.tar', TEMPODIN_DIR)
        except IOError:
            # If called from within the sugarlike/src/universalcorpus dir,
            # move up a directory to reach sugarlike/data/ and sugarlike/test/.
            make_tarfile('../data/odin/odin-' + option + '.tar', TEMPODIN_DIR)
    # Removes the temporary ODIN directory.
    shutil.rmtree(TEMPODIN_DIR)