def handle_conversation_replied(ctx):
    """Relay an Intercom conversation reply back to the originating channel.

    ctx is an Intercom webhook payload; the conversation user's ``user_id``
    encodes the source channel as slash-separated parts
    (e.g. "wechat/<client>/<wechat_id>" — TODO confirm against producer).

    - WeChat users: forward any inline <img> URLs as media messages, then
      send the tag-stripped body as a text message.
    - The configured Intercom bot user: treat the body as an admin command.
    """
    user_id = ctx['data']['item']['user']['user_id']
    user_id_parts = user_id.split('/')
    # Only the first (most recent) conversation part is relayed.
    message = ctx['data']['item']['conversation_parts']['conversation_parts'][0]
    # Pull http(s) image sources out of the HTML body.
    image_urls = [
        m.group(1)
        for m in re.finditer(r'<\s*img\s+src="(http.+?)"\s*>', message['body'])
    ]
    if user_id_parts[0] == 'wechat':
        # NOTE(review): assumes exactly three parts; a '/' inside the client
        # or id would make this unpack raise ValueError — confirm upstream.
        wechat_client, wechat_id = user_id_parts[1:]
        wechat_client_encoded = urlencode(wechat_client)
        # Send images first, then the stripped text body.
        for url in image_urls:
            wechat.send_friend_message(client=wechat_client_encoded,
                                       id=wechat_id,
                                       media_path=url)
        wechat.send_friend_message(client=wechat_client_encoded,
                                   id=wechat_id,
                                   content=remove_tags(message['body']).strip())
    elif 'INTERCOM_BOT_USER_ID' in app.config \
            and user_id == app.config['INTERCOM_BOT_USER_ID']:
        handle_admin_commands(app.config['INTERCOM_BOT_USER_ID'],
                              remove_tags(message['body']).strip())
def entries_file():
    """Serve a plain-text report of databank entries as a file download.

    Query parameters:
        collection: entry collection name (combined with ``databank``).
        databank:   databank name.
        comment:    comment text to filter entries by.
        listing:    what to show per entry — 'comments', 'pdbids',
                    'entries', or 'files'. Required; empty response otherwise.
    """
    # TODO: speed up this method
    collection = request.args.get('collection')
    databank_name = request.args.get('databank')
    comment_text = request.args.get('comment')
    # listing determines what is shown per entry (pdb ids, databank names,
    # comments, file names, etc.)
    listing = request.args.get('listing')
    _log.info("request for entries file %s %s %s %s" % (collection, databank_name, comment_text, listing))
    if not listing:
        return ''
    listing = listing.lower()
    entries = []
    name = "0"  # fallback stem for the download filename
    # Pick the entry source from whichever filter combination was supplied.
    if databank_name and collection:
        entries = get_entries_from_collection(databank_name, collection)
        name = "%s%s" % (databank_name, collection)
    elif databank_name and comment_text:
        entries = get_entries_with_comment(databank_name, comment_text)
        name = "%s%s" % (databank_name, remove_tags(comment_text))
    elif comment_text:
        entries = get_all_entries_with_comment(comment_text)
        name = remove_tags(comment_text)
    text = ''
    if listing == 'comments':
        # Group "<databank>,<pdbid>" lines under each distinct comment.
        d = {}
        for entry in entries:
            if 'comment' in entry:
                c = entry['comment']
                if c not in d:
                    d[c] = ''
                d[c] += '%s,%s\n' % (entry['databank_name'], entry['pdbid'])
        for comment in d:
            text += comment + ":\n" + d[comment]
    else:
        # One line per entry; shape depends on the requested listing.
        for entry in entries:
            if listing == 'pdbids':
                text += entry['pdbid'] + '\n'
            elif listing == 'entries':
                text += '%s,%s\n' % (entry['databank_name'], entry['pdbid'])
            elif listing == 'files' and 'filepath' in entry:
                text += '%s,%s,%s\n' % (entry['databank_name'], entry['pdbid'], entry['filepath'])
    response = Response(text, mimetype='text/plain')
    # NOTE(review): 'name' can contain user-supplied comment text; consider
    # sanitizing before embedding it in the Content-Disposition header.
    response.headers["Content-Disposition"] = "attachment; filename=%s" % ('%s_%s' % (name, listing))
    return response
def test_instances(self):
    """
    Returns the test instances from SemEval2007 Coarse-grain WSD task.

    Yields (instance, answer, sentence, document) tuples, where *sentence*
    and *document* are the tag-stripped textual contexts of the instance.

    >>> coarse_wsd = SemEval2007_Coarse_WSD()
    >>> inst2ans = coarse_wsd.get_answers()
    >>> for inst in inst2ans:
    ...     print inst, inst2ans[inst]
    ...     break
    d004.s073.t013 answer(sensekey=[u'pointer%1:06:01::', u'pointer%1:06:00::', u'pointer%1:10:00::'], lemma=u'pointer', pos=u'n')
    """
    Instance = namedtuple('instance', 'id, lemma, word')
    test_file = io.open(self.test_file, 'r').read()
    inst2ans = self.get_answers()
    for text in bsoup(test_file).findAll('text'):
        textid = text['id']  # NOTE(review): assigned but never used here
        # Whole-document context: every non-empty tag-stripped line, joined.
        document = " ".join([remove_tags(i) for i in str(text).split('\n')
                             if remove_tags(i)])
        for sent in text.findAll('sentence'):
            # Per-sentence context, built the same way as the document.
            sentence = " ".join([remove_tags(i) for i in str(sent).split('\n')
                                 if remove_tags(i)])
            for instance in sent.findAll('instance'):
                instid = instance['id']
                lemma = instance['lemma']
                word = instance.text
                inst = Instance(instid, lemma, word)
                yield inst, inst2ans[instid], unicode(sentence), unicode(document)
def pickle2plaintext(testing=False, option="cleanest"):
    """ Converted ODIN IGTs from the .pk file into tab-delimited plaintexts.

    Writes one "odin-<lang>.txt" per language into a temp dir, then packs
    the files into a tarfile ("test/..." when *testing*, "../data/odin/..."
    otherwise) and removes the temp dir.

    option:
        'cleanest' - keep only IGTs without any punctuation.
        'cleaner'  - strip trailing example numbers / parentheses.
        anything else - keep IGTs as-is (non-empty only).
    """
    # Makes a temp output directory for the individual files.
    TEMPODIN_DIR = "./tmpodin/"  # for saving the temp udhr files.
    if not os.path.exists(TEMPODIN_DIR):
        os.makedirs(TEMPODIN_DIR)
    for language, documents in sorted(load_odin_pickle()):
        tab_igts = []
        for d in documents:
            # d is (src, eng, gloss, cite); skip blank source lines.
            if d[0].strip() == "":
                continue
            src = remove_tags(d[0])
            # Removes heading bullets, e.g. (1)... | 1) | ( 12 ) | i. ... | A2. ...
            src = re.sub(r"^\(?\s?\w{1,5}\s*[):.]\s*", "", src)
            # Applied twice more to strip stacked/nested numbering —
            # TODO confirm the repetition is intentional.
            src = re.sub(r"^\(?\w{1,5}\s*[):.]\s*", "", src)
            src = re.sub(r"^\(?\w{1,5}\s*[):.]\s*", "", src)
            morphemes = src
            # Joins the morphemes up into words.
            words = re.sub(" *- *", "", src)
            if option == "cleanest":
                # Accepts only IGTs without punctuation.
                if src == "" or any(i for i in string.punctuation if i in src):
                    continue
            elif option == "cleaner":
                # Removes the example number at the end.
                patterns = [r"\(.{1,}\)", r"[\(\)]"]
                for pat in patterns:
                    src = re.sub(pat, "", src)
            else:
                # Accepts IGTs as they are.
                if src == "":
                    continue
            # src, eng, gloss, cite = d[0], d[1], d[2], d[3]
            tab_igts.append([words, morphemes, remove_tags(d[1]),
                             remove_tags(d[2]), d[3]])
        if len(tab_igts) > 0:
            with codecs.open(TEMPODIN_DIR + "odin-" + language + ".txt",
                             "w", "utf8") as fout:
                for igt in tab_igts:
                    print >> fout, "\t".join(igt)
        if testing:
            break  # one language is enough for a test run
    if testing:
        # Compress the utf8 UDHR files into a single tarfile in the test dir.
        try:
            make_tarfile("test/odin-" + option + ".tar", TEMPODIN_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            make_tarfile("../test/odin-" + option + ".tar", TEMPODIN_DIR)
    else:
        # Compresses the utf8 UDHR files into a single tarfile.
        try:
            make_tarfile("../data/odin/odin-" + option + ".tar", TEMPODIN_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            # NOTE(review): fallback path is identical to the try path, so this
            # except clause cannot help — probably meant a different prefix.
            make_tarfile("../data/odin/odin-" + option + ".tar", TEMPODIN_DIR)
    # Remove the udhr-utf8 directory.
    shutil.rmtree(TEMPODIN_DIR)
def __init__(self, swots, testId, questionId, answer, timeSpent):
    """Submit an answer for one quiz question and store the feedback.

    Posts the answer/time to the feedback endpoint and records the correct
    answer index, tag-stripped solution text, the chosen option's text and
    the user's status for this question.
    """
    endpoint = "/quiz/users/%s/tests/%s/feedback/%s" % (
        swots.userId, testId, questionId)
    payload = {"answer": answer, "timeSpent": timeSpent}
    fb = swots.postJson(endpoint, payload)
    self.answer = fb["answer"]
    self.solution = remove_tags(fb["solution"]["solutionText"]).strip()
    # The answer index is 1-based; options list is 0-based.
    self.answerText = remove_tags(fb["options"][self.answer - 1])
    self.status = fb["userStats"]["status"]
def entries_file():
    """Serve a plain-text report of databank entries as a file download.

    Query parameters:
        collection: entry collection name (combined with ``databank``).
        databank:   databank name.
        comment:    comment text to filter entries by.
        listing:    what to show per entry — 'comments', 'pdb_ids',
                    'entries', or 'files'. Required; empty response otherwise.
    """
    # TODO: speed up this method
    collection = request.args.get('collection')
    databank_name = request.args.get('databank')
    comment_text = request.args.get('comment')
    # listing determines what is shown per entry(pdb ids, databank names,
    # comments, file names, etc.)
    listing = request.args.get('listing')
    _log.info("request for entries file %s %s %s %s" % (
        collection, databank_name, comment_text, listing))
    if not listing:
        return ''
    listing = listing.lower()
    entries = []
    name = "0"  # fallback stem for the download filename
    # Pick the entry source from whichever filter combination was supplied.
    if databank_name and collection:
        entries = get_entries_from_collection(databank_name, collection)
        name = "%s%s" % (databank_name, collection)
    elif databank_name and comment_text:
        entries = get_entries_with_comment(databank_name, comment_text)
        name = "%s%s" % (databank_name, remove_tags(comment_text))
    elif comment_text:
        entries = get_all_entries_with_comment(comment_text)
        name = remove_tags(comment_text)
    text = ''
    if listing == 'comments':
        # Group "<databank>,<pdb_id>" lines under each distinct comment.
        d = {}
        for entry in entries:
            if 'comment' in entry:
                c = entry['comment']
                if c not in d:
                    d[c] = ''
                d[c] += '%s,%s\n' % (entry['databank_name'],
                                     entry['pdb_id'])
        for comment in d:
            text += comment + ":\n" + d[comment]
    else:
        # One line per entry; shape depends on the requested listing.
        for entry in entries:
            if listing == 'pdb_ids':
                text += entry['pdb_id'] + '\n'
            elif listing == 'entries':
                text += '%s,%s\n' % (entry['databank_name'],
                                     entry['pdb_id'])
            elif listing == 'files' and 'filepath' in entry:
                text += '%s,%s,%s\n' % (entry['databank_name'],
                                        entry['pdb_id'],
                                        entry['filepath'])
    response = Response(text, mimetype='text/plain')
    # NOTE(review): 'name' can contain user-supplied comment text; consider
    # sanitizing before embedding it in the Content-Disposition header.
    header_val = "attachment; filename=%s_%s" % (name, listing)
    response.headers["Content-Disposition"] = header_val
    return response
def parseListToGamesInBlock(self, mix):
    """Parse a block of schedule markup rows into a list of GameInfo objects.

    Only rows containing " at " (and not "Updated") produce games; "Week "
    headers and " vs. " rows are recognised but yield nothing.
    """
    teams_ls = []
    time = ""
    chan = ""
    listOfGames = []
    for item in mix:
        string_comp = str(item)
        # Normalize non-breaking spaces so the substring tests below match.
        string_comp = string_comp.replace(u'\xa0', ' ')
        validGame = 0
        if "Week " in string_comp:
            print("Week")
        elif " vs. " in string_comp:
            # NOTE(review): the '.' in ' (vs). ' is an unescaped regex
            # wildcard, and validGame stays 0 here, so this split result is
            # never turned into a GameInfo — confirm this is intentional.
            teams_ls = re.split(' (vs). ', string_comp)
            #print("VS: " + string_comp)
            #teams_ls[2] = teams_ls[2].split('(')[0]
            #print("t1: " + teams_ls[0] + " t2: " + teams_ls[2])
            validGame = 0
        elif (" at " in string_comp) and (not "Updated" in string_comp):
            tagless_team_line = utils.remove_tags(
                str(item.contents[self.CONST_GAME_INDEX]))
            # ' (at) ' keeps the separator, so teams land at indices 0 and 2.
            teams_ls = re.split(' (at) ', tagless_team_line)
            #print("AT: " )
            #print("AT CONTENTS: " + str(item.contents[1]))
            #print("t0 " + teams_ls[0] + " t1: " + teams_ls[1]+ " t2: " + teams_ls[2])
            # Time
            tagless_time_line = utils.remove_tags(
                str(item.contents[self.CONST_TIME_INDEX]))
            time = tagless_time_line
            #print("Time: " + time)
            # Channel
            tagless_chan_line = utils.remove_tags(
                str(item.contents[self.CONST_CHANNEL_INDEX]))
            chan = tagless_chan_line
            #print("Chan: " + chan)
            validGame = 1
        else:
            print("Error?")
            #print("Possible error parsing game: " + string_comp)
        if validGame:
            # temp_game_info exists only for the debug prints below.
            temp_game_info = GameInfo(teams_ls[0], teams_ls[2], time, chan)
            print("Adding GI: " + teams_ls[0] + " " + teams_ls[2] + " " +
                  time + " " + chan)
            print("Added: " + temp_game_info.getTeam1() + " " +
                  temp_game_info.getTeam2() + " " +
                  temp_game_info.getTimeStr() + " " +
                  temp_game_info.getChannel())
            listOfGames.append(
                GameInfo(teams_ls[0], teams_ls[2], time, chan))
    return listOfGames
def yield_sentences(self):
    """Iterate over the test file's <text> / <sentence> elements.

    Yields (sent, context_sent, context_doc, inst2ans, textid) where the
    contexts are the tag-stripped, whitespace-joined lines of the sentence
    and its enclosing document.
    """
    raw = io.open(self.test_file, 'r').read()
    inst2ans = self.get_answers()
    for text in bsoup(raw).findAll('text'):
        if not text:
            continue
        textid = text['id']
        doc_lines = [remove_tags(line) for line in str(text).split('\n')
                     if remove_tags(line)]
        context_doc = " ".join(doc_lines)
        for sent in text.findAll('sentence'):
            sent_lines = [remove_tags(line) for line in str(sent).split('\n')
                          if remove_tags(line)]
            context_sent = " ".join(sent_lines)
            yield sent, context_sent, context_doc, inst2ans, textid
def _single_convert(self, input_file_object):
    """Convert one HTML input file to the final format.

    Strips tags from the source, writes the cleaned text to a uniquely named
    intermediate .html in the temp dir, runs the external converter, and
    returns the output path on success. Returns None (after reporting the
    failed conversion) when the converter produced no output, or when
    input_file_object is falsy.
    """
    if not input_file_object:
        return None
    source_path = input_file_object.get_input_file_path()
    final_name = rename_filename_with_extension(
        os.path.basename(source_path), self.final_format)
    # Timestamp-derived name keeps concurrent conversions from colliding.
    tmp_html_name = str(time.time()).replace('.', '') + '.html'
    target_path = os.path.join(self.tmp_dir, final_name)
    tmp_html_path = os.path.join(self.tmp_dir, tmp_html_name)
    with codecs.open(source_path, "r", "utf-8") as src:
        stripped = remove_tags(src.read())
    with open(tmp_html_path, 'w') as dst:
        dst.write(stripped)
    command = CONVERTER_LOCATION.format(
        input_file_path=tmp_html_path, output_file_path=target_path)
    self.execute(command)
    if os.path.isfile(target_path):
        return target_path
    self.handle_failed_conversion(input_file_object)
    log.error('Conversion failed from HTML => PDF')
    return None
def create():
    """Create or update Contact rows from the posted 'contacts' JSON list.

    Existing contacts (matched by id) are updated in place; unknown ids are
    inserted as new rows. Names and emails are tag-stripped and cell numbers
    normalized before saving. Commits once, then redirects to the contact
    list view.
    """
    posted = json.loads(request.form.get('contacts'))
    for data in posted:
        existing = Contact.query.filter_by(id=data['id']).first()
        if existing:
            existing.name = remove_tags(data['name'])
            existing.email = remove_tags(data['email'])
            existing.cell_number = format_phone_number(data['cell_number'])
            existing.carrier_id = data['carrier_id']
            existing.notify = data['notify']
        else:
            db.session.add(Contact(
                name=remove_tags(data['name']),
                email=remove_tags(data['email']),
                cell_number=format_phone_number(data['cell_number']),
                carrier_id=data['carrier_id'],
                notify=data['notify']))
    db.session.commit()
    return redirect(url_for('contact.contacts'))
def addWKTLayer(self, WKT, legendName, epsg, fields, originalcrs=True): """ Build a Vector layer from WKT string and adds it to the map canvas """ # strip html tags legendName = remove_tags(legendName) # build the geometry geom = QgsGeometry.fromWkt(WKT) if not isinstance(geom, QgsGeometry): self.setErrorMessages('addWKTLayer', 'Check WKT string, probably is not correct!') return False # trasform geometry id originalcrs is set to False if not originalcrs: currentCrs = self.mapCanvas.mapSettings().destinationCrs() trasform = QgsCoordinateTransform(QgsCoordinateReferenceSystem(int(epsg[5:])), currentCrs) geom.transform(trasform) epsg = str(currentCrs.authid()) # check the type geomType = geom.type() if geomType in (QGis.UnknownGeometry, QGis.NoGeometry): self.setErrorMessages('addWKTLayer', 'Qgis said that your WKT is not a valid geometry or is a Unknown geometry') return False # just instance QgsRubberBand or a Vertex if is a polygon / line or a point # get the geometry by ogr # build fields parameter fieldsAttr = json.loads(fields) fieldsAttrList = [] for k, v in fieldsAttr.items(): if type(v) == str or type(v) == unicode: fieldsAttrList.append(k + ':string') elif type(v) == float: fieldsAttrList.append(k + ':double') elif type(v) == int: fieldsAttrList.append(k + ':integer') uriVLayer = TYPE_MAP[geom.wkbType()] + '?crs=' + epsg + '&field=' + '&field='.join(fieldsAttrList) logQgisConsole(uriVLayer) VLayer = QgsVectorLayer(uriVLayer, legendName, "memory") pr = VLayer.dataProvider() seg = QgsFeature() layerFields = pr.fields() seg.setFields(layerFields) for k, v in fieldsAttr.items(): seg.setAttribute(k, v) seg.setGeometry(geom) pr.addFeatures([seg]) VLayer.updateExtents() QgsMapLayerRegistry.instance().addMapLayers([VLayer]) return VLayer.id()
def check_lang(langcode, option):
    """ Queries SIL website for language status given the language code.

    langcode: ISO 639-3 three-letter identifier.
    option:   row label to look for on the page, e.g. "Status" (matched
              as "<option>:" in the table row text).

    Returns the lower-cased, tag-stripped text of the first matching <td>
    cell, or None when SIL reports the code as invalid. Raises IndexError
    if no table row matches *option*.
    """
    import urllib2, re
    from bs4 import BeautifulSoup as bs
    from utils import remove_tags
    url = "http://www-01.sil.org/iso639-3/documentation.asp?id=" + langcode
    page = urllib2.urlopen(url).read().decode('utf8')
    # SIL serves this sentence instead of a data page for unknown codes.
    if "The supplied value for the code parameter is not " + \
            "a valid three-letter identifier." in page:
        return None
    # Keep the first table row whose text contains e.g. "Status:".
    status = unicode([i for i in bs(page).
                      find_all('tr') if option + ":" in i.text][0])
    # Grab the first <td>...</td> cell from that row's markup.
    status = re.findall(r'<td>.*?<\/td>', status)[0]
    return remove_tags(status).lower()
def build_search_db(entries):
    """Write search.json — the client-side search index for all entries.

    Each record carries the entry's positional id, title, formatted publish
    date, tag-stripped content and URL.
    """
    records = [
        {
            "id": idx,
            "title": post['title'],
            "date": post['date'].strftime(PUBLISHED_DATE_FORMAT),
            "content": remove_tags(post['html']),
            "url": post['url'],
        }
        for idx, post in enumerate(entries)
    ]
    write(join(INDEX_DIR, "search.json"), json.dumps(records))
def get_odin_igts(ODINFILE=parentddir + '/data/odin/odin-full.tar'):
    """
    Extracts the examples from the ODIN igts and returns a defaultdict(list),
    where the keys are the lang iso codes and values are the examples.

    Each value is a (src, eng, gloss, cite) tuple of unicode strings.

    >>> igts = get_odin_igts()
    >>> for lang in igts:
    >>>     for igt in igts[lang]:
    >>>         print lang, igt
    """
    tar = tarfile.open(ODINFILE)
    docs = defaultdict(list)
    for infile in tar:
        if '.xml' in infile.name:  # there's a rogue file in the tar that is not xml.
            # Member name minus ".xml" doubles as the language code.
            lang = infile.name[:-4].lower()
            ##print lang
            # Find the <igt>...</igt> in the xml.
            odinfile = tar.extractfile(infile).read()
            igts = bs(odinfile).findAll('igt')
            citations = bs(odinfile).findAll('citation')
            # Pair each igt with its citation positionally — assumes the file
            # lists them in the same order; TODO confirm.
            for igt, cite in zip(igts, citations):
                # Find the <example>...</example> in the igt.
                examples = bs(unicode(igt)).findAll('example')
                cite = remove_tags(unicode(cite)).strip(' </p>')
                for eg in examples:
                    try:
                        # Only use triplets lines and assumes that
                        # line1: src, line2:eng, line3:gloss
                        src, eng, gloss = bs(unicode(eg)).findAll('line')
                        src, eng, gloss, cite = map(unicode,
                                                    [src, eng, gloss, cite])
                        docs[lang].append((src, eng, gloss, cite))
                        ##print src, eng, gloss, cite
                    except:
                        raise
                        # NOTE(review): unreachable — `raise` above exits first.
                        print eg
    return docs
def get_odin_igts(ODINFILE=parentddir + "/data/odin/odin-full.tar"):
    """
    Extracts the examples from the ODIN igts and returns a defaultdict(list),
    where the keys are the lang iso codes and values are the examples.

    Each value is a (src, eng, gloss, cite) tuple of unicode strings.
    NOTE(review): near-duplicate of the single-quote variant of this function
    elsewhere in this file; consider keeping only one.

    >>> igts = get_odin_igts()
    >>> for lang in igts:
    >>>     for igt in igts[lang]:
    >>>         print lang, igt
    """
    tar = tarfile.open(ODINFILE)
    docs = defaultdict(list)
    for infile in tar:
        if ".xml" in infile.name:  # there's a rogue file in the tar that is not xml.
            # Member name minus ".xml" doubles as the language code.
            lang = infile.name[:-4].lower()
            print lang
            # Find the <igt>...</igt> in the xml.
            odinfile = tar.extractfile(infile).read()
            igts = bs(odinfile).findAll("igt")
            citations = bs(odinfile).findAll("citation")
            # Pair each igt with its citation positionally — assumes the file
            # lists them in the same order; TODO confirm.
            for igt, cite in zip(igts, citations):
                # Find the <example>...</example> in the igt.
                examples = bs(unicode(igt)).findAll("example")
                cite = remove_tags(unicode(cite)).strip(" </p>")
                for eg in examples:
                    try:
                        # Only use triplets lines and assumes that
                        # line1: src, line2:eng, line3:gloss
                        src, eng, gloss = bs(unicode(eg)).findAll("line")
                        src, eng, gloss, cite = map(unicode,
                                                    [src, eng, gloss, cite])
                        docs[lang].append((src, eng, gloss, cite))
                        ##print src, eng, gloss, cite
                    except:
                        raise
                        # NOTE(review): unreachable — `raise` above exits first.
                        print eg
    return docs
def question(self):
    """Return the question text with HTML tags stripped."""
    raw = self._qDict["questionText"]
    return remove_tags(raw)
def options(self):
    """Return the answer options, each with HTML tags stripped."""
    cleaned = []
    for option_html in self._qDict["options"]:
        cleaned.append(remove_tags(option_html))
    return cleaned
def solution(self):
    """Fetch feedback (if not already loaded) and return the tag-stripped solution text."""
    self._getFeedback()
    raw = self._feedback["solution"]["solutionText"]
    return remove_tags(raw)
def pickle2plaintext(testing=False, option='cleanest'):
    """ Converted ODIN IGTs from the .pk file into tab-delimited plaintexts.

    Writes one "odin-<lang>.txt" per language into a temp dir, then packs
    the files into a tarfile ("test/..." when *testing*, "../data/odin/..."
    otherwise) and removes the temp dir.

    NOTE(review): near-duplicate of the double-quote variant of this function
    elsewhere in this file; consider keeping only one.

    option:
        'cleanest' - keep only IGTs without any punctuation.
        'cleaner'  - strip trailing example numbers / parentheses.
        anything else - keep IGTs as-is (non-empty only).
    """
    # Makes a temp output directory for the individual files.
    TEMPODIN_DIR = './tmpodin/'  # for saving the temp udhr files.
    if not os.path.exists(TEMPODIN_DIR):
        os.makedirs(TEMPODIN_DIR)
    for language, documents in sorted(load_odin_pickle()):
        tab_igts = []
        for d in documents:
            # d is (src, eng, gloss, cite); skip blank source lines.
            if d[0].strip() == "":
                continue
            src = remove_tags(d[0])
            # Removes heading bullets, e.g. (1)... | 1) | ( 12 ) | i. ... | A2. ...
            src = re.sub(r'^\(?\s?\w{1,5}\s*[):.]\s*', '', src)
            # Applied twice more to strip stacked/nested numbering —
            # TODO confirm the repetition is intentional.
            src = re.sub(r'^\(?\w{1,5}\s*[):.]\s*', '', src)
            src = re.sub(r'^\(?\w{1,5}\s*[):.]\s*', '', src)
            morphemes = src
            # Joins the morphemes up into words.
            words = re.sub(' *- *', '', src)
            if option == 'cleanest':
                # Accepts only IGTs without punctuation.
                if src == '' or any(i for i in string.punctuation if i in src):
                    continue
            elif option == 'cleaner':
                # Removes the example number at the end.
                patterns = [r"\(.{1,}\)", r"[\(\)]"]
                for pat in patterns:
                    src = re.sub(pat, '', src)
            else:
                # Accepts IGTs as they are.
                if src == '':
                    continue
            # src, eng, gloss, cite = d[0], d[1], d[2], d[3]
            tab_igts.append([words, morphemes, remove_tags(d[1]), \
                             remove_tags(d[2]), d[3]])
        if len(tab_igts) > 0:
            with codecs.open(TEMPODIN_DIR + 'odin-' + language + '.txt',
                             'w', 'utf8') as fout:
                for igt in tab_igts:
                    print >> fout, "\t".join(igt)
        if testing:
            break  # one language is enough for a test run
    if testing:
        # Compress the utf8 UDHR files into a single tarfile in the test dir.
        try:
            make_tarfile('test/odin-' + option + '.tar', TEMPODIN_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            make_tarfile('../test/odin-' + option + '.tar', TEMPODIN_DIR)
    else:
        # Compresses the utf8 UDHR files into a single tarfile.
        try:
            make_tarfile('../data/odin/odin-' + option + '.tar', TEMPODIN_DIR)
        except IOError:
            # if function is called within the sugarlike/src/universalcorpus dir
            # To move up directory to access sugarlike/data/ and sugarlike/test/.
            # NOTE(review): fallback path is identical to the try path, so this
            # except clause cannot help — probably meant a different prefix.
            make_tarfile('../data/odin/odin-' + option + '.tar', TEMPODIN_DIR)
    # Remove the udhr-utf8 directory.
    shutil.rmtree(TEMPODIN_DIR)