def _get_doc_info(self, obj):
    d = self._get_general_info(obj)
    fields = {}
    fields['description'] = obj.description
    fields['text'] = dehtml(obj.getText())
    d['fields'] = fields
    return d

def raw_message_to_obj(response):
    global service
    obj = collections.OrderedDict()
    fields = ['Subject', 'Date', 'From', 'To', 'Cc', 'Message-ID']
    try:
        # Fill the subject first, then the snippet, then the remaining
        # headers, so the OrderedDict keeps that display order.
        for f in fields[:1]:
            v = [x['value'] for x in response['payload']['headers'] if x['name'] == f]
            obj[f] = ''.join(v)  # an empty list joins to an empty string
        obj['snippet'] = dehtml.dehtml(response['snippet'])
        for f in fields[1:]:
            v = [x['value'] for x in response['payload']['headers'] if x['name'] == f]
            obj[f] = ''.join(v)  # an empty list joins to an empty string
        obj['message'] = parse_multipart_message(response)
    except Exception as error:
        print('An error occurred: %s' % error)
    return obj

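# Hypothetical usage sketch, not part of the original code: fetch a full
# message with the google-api-python-client Gmail service and pass it to
# raw_message_to_obj. The names `creds` and `msg_id` are assumptions.
from googleapiclient.discovery import build

def fetch_message_obj(creds, msg_id):
    service = build('gmail', 'v1', credentials=creds)
    response = service.users().messages().get(
        userId='me', id=msg_id, format='full').execute()
    return raw_message_to_obj(response)
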
def get(self):
    format = self.request.get("format")
    hook = db.GqlQuery(self.__queryString())
    # Redo the query until data is returned.
    while hook.count() == 0:
        hook = db.GqlQuery(self.__queryString())
    soup = BeautifulSoup.BeautifulSoup(hook[0].text.replace(";", ","))
    ash = soup.findAll("a")
    ashlinks = []
    for a in ash:
        metadict = {
            'metaurl': "http://en.wikipedia.org" + a["href"],
            'metatext': a.text,
        }
        ashlinks.append(metadict)
    tex = dehtml.dehtml(hook[0].text)
    tex = tex.replace("... that ", "", 1).replace(';', ',').rstrip("?")
    texlt = tex.split(" ", 1)
    tex = texlt[0].capitalize() + " " + texlt[1]
    responseData = {
        "response": [{
            "hook": {
                "title": unquote(hook[0].link.replace(";", ",")),
                "text": tex,
                "metadata": ashlinks,
            }
        }]
    }
    if format == "json":
        self.ReturnJSON(responseData)
    elif format == "xml":
        self.ReturnXML(responseData)
    else:
        self.response.out.write("Incompatible format or no format specified!")

def fill_content(self):
    assert self.kind
    plain_texts = []
    word_texts = []
    html_texts = []
    has_attachment = False
    for part in self.envelope_object.walk():
        if part.get_content_type() == 'message/delivery-status':
            self.save()
            raise Exception('Bounce email: {}.'.format(self))
        elif not part.is_multipart():
            if part.get_content_type() == 'text/plain':
                charset = part.get_content_charset()
                payload = part.get_payload(decode=True)
                if payload != '':
                    plain_texts.append(payload.decode(charset))
            elif part.get_content_type() == 'text/html':
                charset = part.get_content_charset()
                payload = part.get_payload(decode=True)
                if payload != '':
                    html_texts.append(dehtml(payload.decode(charset)))
            elif self.kind == 'response':
                # Only accept attachments from representatives.
                if part.get_content_type() == 'application/msword':
                    word_texts.append(
                        antiword.antiword_string(
                            part.get_payload(decode=True)).replace(
                                '[pic]', '').replace('|', ''))
                if part.get_content_type() in ATTACHMENT_MIMETYPES:
                    has_attachment = True
                    attachment = Attachment(
                        mimetype=part.get_content_type(),
                        original_filename=part.get_filename(),
                        message=self,
                    )
                    attachment.set_content(part.get_payload(decode=True))
                    attachment.save()
                    self.parent.is_locked = True
                    self.parent.save()
                else:
                    logger.warning(
                        u'Skipping attachment {} ({}) in {} {}'.format(
                            part.get_filename(),
                            part.get_content_type(),
                            self.kind,
                            self.id,
                        ))
    if not (plain_texts or html_texts or word_texts or has_attachment):
        raise Exception("Couldn't extract any content")
    body_text = '\n\n***\n\n'.join((plain_texts or html_texts) + word_texts)
    self.body_text = utils.remove_consequentive_empty_lines(
        utils.remove_reply_email(body_text))

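# Standalone sketch, not from the original project: the same walk()-based
# part traversal using only the standard-library email parser. `raw_bytes`
# (a raw RFC 822 message as bytes) is an assumed input.
import email

def extract_plain_text(raw_bytes):
    msg = email.message_from_bytes(raw_bytes)
    texts = []
    for part in msg.walk():
        if part.is_multipart():
            continue  # multipart containers carry no payload of their own
        if part.get_content_type() == 'text/plain':
            charset = part.get_content_charset() or 'utf-8'
            texts.append(
                part.get_payload(decode=True).decode(charset, errors='replace'))
    return '\n\n'.join(texts)
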
def setTitle(self, fulltext=None):
    if not fulltext:
        try:
            with open(self.path, 'r') as f:
                fulltext = f.read()
        except Exception as e:
            print(str(e))
    html = False
    try:
        html = (fulltext.split('\n', 1)[0].split()[0] == '<!DOCTYPE')
    except Exception:
        pass
    if html:
        fulltext = dehtml.dehtml(fulltext)
    self.title = fulltext.strip().split('\n', 1)[0][:MAX_TITLE_LEN]
    if not self.title:
        self.title = "Untitled"
    self.setText(self.title)
    return self.title

def process_event(self, event):
    self.logger.debug("Received event: %r", event)
    action = event['action']
    if action == Chatter.Actions.GET_READY:
        self.cid = event['cid']
        self.send_data(action=Chatter.Actions.SET_READY)
    elif action == Chatter.Actions.START_CHAT:
        self.connected = True
        self.send_unsent_messages()
        self.on_start_chat()
    elif action == Chatter.Actions.STOP_CHAT:
        self.disconnected = True
        self.on_stop_chat()
    elif action in (Chatter.Actions.START_TYPING, Chatter.Actions.STOP_TYPING):
        self.on_typing(started=(action == Chatter.Actions.START_TYPING))
    elif action == Chatter.Actions.NEW_MESSAGE:
        if event['user'] != Chatter.USER_ME:
            self.on_message(dehtml.dehtml(event['message']))
    elif action == Chatter.Actions.PING:
        self.on_ping()
    else:
        self.logger.error("Unknown event action: %r", event)

def get_all_threads(service, querystring):
    all_threads = []
    try:
        thread_count = 0
        start = True
        while start or 'nextPageToken' in response:
            if start:
                page_token = None
                start = False
            else:
                page_token = response['nextPageToken']
            response = service.users().threads().list(
                userId='me', pageToken=page_token, q=querystring).execute()
            if 'threads' in response:
                thread_count += len(response['threads'])
                print(" == Loading ", thread_count, "threads")
                for thread in response['threads']:
                    thread['snippet'] = dehtml.dehtml(thread['snippet'])
                    print(thread)
                    all_threads.append(thread)
    except errors.HttpError as error:
        print('An HTTPError occurred: %s' % error)
    return all_threads

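# Hypothetical usage sketch, not part of the original code: build a Gmail
# service with google-api-python-client and pull every thread matching a
# query. `creds` is assumed to be a valid OAuth2 credentials object, and
# the query string is only an example.
from googleapiclient.discovery import build

def print_recent_unread(creds):
    service = build('gmail', 'v1', credentials=creds)
    threads = get_all_threads(service, 'is:unread newer_than:7d')
    for thread in threads:
        print(thread['id'], thread['snippet'])
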
def tfidfScores(faceUrls, noFaceUrls, faceSrcs, noFaceSrcs, sourceContent):
    print "srcs: " + str(noFaceSrcs)
    print "getting tfidf scores"
    sortedSrcs = []
    if faceUrls is not None and faceSrcs is not None:
        faceDocuments = []
        for i in range(len(faceUrls)):
            try:
                # Get the HTML behind the search-result URL.
                response = requests.get(faceUrls[i])
                # Don't consider the URL if we don't get a good response.
                if response.status_code == 200 and response.text != '':
                    # Try to decode with the reported encoding.
                    encoding = response.encoding
                    text = response.content.decode(encoding)
                    # Clean the javascript and css from the HTML.
                    cleaner = Cleaner()
                    cleaner.javascript = True
                    cleaner.style = True
                    cleaned = cleaner.clean_html(text)
                    # Try to extract only the text from the remaining HTML.
                    try:
                        parsed = dehtml.dehtml(cleaned)
                    except UnicodeDecodeError:
                        print "UnicodeDecodeError"
                        # Discard this URL.
                        continue
                    # Lowercase it and remove punctuation.
                    lowers = parsed.lower()
                    if type(lowers) is unicode:
                        ascii = unicodedata.normalize("NFKD", lowers).encode(
                            "ascii", "ignore")
                        noPunct = ascii.translate(None, string.punctuation)
                    elif type(lowers) is str:
                        noPunct = lowers.translate(None, string.punctuation)
                    faceDocuments.append(noPunct)
            except Exception:
                pass
        # Lowercase the source content (punctuation was already removed).
        lowers = sourceContent.lower()
        faceDocuments.insert(0, lowers)
        # tf-idf on the documents (search results along with source content).
        tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words="english")
        faceTfs = tfidf.fit_transform(faceDocuments)
        # Cosine similarity.
        faceSimilarity = faceTfs * faceTfs.T
        # Convert row 0 into an array (document similarities to the source content).
        similarities = []
        print("building similarities")
        print("face similarity length: %d" % faceSimilarity.get_shape()[1])
        for i in range(faceSimilarity.get_shape()[1]):
            similarities.append(faceSimilarity[0, i])
        # Sort the sources by decreasing cosine similarity.
        indices = [
            i[0] for i in sorted(
                enumerate(similarities), key=lambda x: x[1], reverse=True)
        ]
        for i in range(len(indices)):
            if len(faceSrcs) > indices[i]:
                sortedSrcs.append(faceSrcs[indices[i]])
    # Same as above but for the no-face-detection URLs.
    noFaceDocuments = []
    for i in range(len(noFaceUrls)):
        try:
            response = requests.get(noFaceUrls[i])
            if response.status_code == 200 and response.text != '':
                encoding = response.encoding
                text = response.content.decode(encoding)
                cleaner = Cleaner()
                cleaner.javascript = True
                cleaner.style = True
                cleaned = cleaner.clean_html(text)
                try:
                    parsed = dehtml.dehtml(cleaned)
                except UnicodeDecodeError:
                    print "UnicodeDecodeError"
                    continue
                lowers = parsed.lower()
                if type(lowers) is unicode:
                    ascii = unicodedata.normalize("NFKD", lowers).encode(
                        "ascii", "ignore")
                    noPunct = ascii.translate(None, string.punctuation)
                elif type(lowers) is str:
                    noPunct = lowers.translate(None, string.punctuation)
                noFaceDocuments.append(noPunct)
        except Exception:
            pass
    lowers = sourceContent.lower()
    noFaceDocuments.insert(0, lowers)
    tfidf = TfidfVectorizer(tokenizer=tokenize, stop_words="english")
    noFaceTfs = tfidf.fit_transform(noFaceDocuments)
    noFaceSimilarity = noFaceTfs * noFaceTfs.T
    similarities = []
    print("building similarities")
    print("no-face similarity length: %d" % noFaceSimilarity.get_shape()[1])
    for i in range(noFaceSimilarity.get_shape()[1]):
        similarities.append(noFaceSimilarity[0, i])
    indices = [
        i[0] for i in sorted(
            enumerate(similarities), key=lambda x: x[1], reverse=True)
    ]
    for i in range(len(indices)):
        if len(noFaceSrcs) > indices[i]:
            sortedSrcs.append(noFaceSrcs[indices[i]])
    # End of tf-idf.
    print "done doing tfidf"
    print "sortedSrcs: " + str(sortedSrcs)
    # Return image sources sorted by cosine similarity of their page text
    # to the source content.
    return sortedSrcs

def eliminarTagsHTML(pDocumento):
    # Strip HTML tags from the document and return plain text.
    return dehtml.dehtml(pDocumento)

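# Hypothetical usage (the HTML string is made up): strip markup before any
# further text processing.
texto_plano = eliminarTagsHTML("<p>Hola <b>mundo</b></p>")
print(texto_plano)
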
while input != ';':
    i_list = input.split(', ')
    words += ([i_list[0], i_list[1]],)
    input = raw_input("Another word? ; to quit\n")
scores = {w: float(n) for w, n in words}
total = 0
output = open("results.txt", "w")
results = {}
for dirs, subdirs, files in os.walk(root):
    for f in files:
        if f.endswith('.rtf'):
            doc = Rtf15Reader.read(open(os.path.join(dirs, f), "rb"))
            total = 0
            # Convert the RTF document to XHTML, strip the markup, then
            # lowercase and split into words before scoring.
            text = dehtml(XHTMLWriter.write(doc).read()).lower().split()
            for word in text:
                word = re.sub(r'\W+', '', word)
                total += scores.get(word, 0)
            results[f] = total
for key, value in sorted(results.items()):
    output.write(key + " " + str(value) + "\n")
output.close()
print "Finished! Check results.txt"
raw_input("\nPress enter to close.")