import requests as rq                      # module-level imports assumed by this snippet;
from bs4 import BeautifulSoup as BSHTML    # the original may alias the legacy BeautifulSoup package instead


def get_seq(chr_no, start, size):
    # 'hits' is of the form "3:3676292:3678292"
    if (size < 0):
        size = -size
        # excludes the base at reg_seq_start
        hits = str(chr_no) + ":" + str(start - size) + ":" + str(start - 1)
    else:
        # includes the base at reg_seq_start
        hits = str(chr_no) + ":" + str(start) + ":" + str(start + size - 1)
    params = {
        'program': 'returnFASTA',
        'db': 'GENOME',
        'dbid': '4',
        "hits": hits,
        "DBpath": "/DATA/PlantGDB/Index/Blast/OsGDB/OSgenome",
        "xGDB": "OsGDB"
    }
    r = rq.post('http://www.plantgdb.org/OsGDB/cgi-bin/formatReader.pl',
                data=params)
    html_resp = r.content
    bs = BSHTML(html_resp)
    fasta_string = bs.pre.contents[0].strip()
    return fasta_string
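# A minimal usage sketch (hypothetical coordinates, and it needs network access to
# plantgdb.org): a negative size fetches `size` bases upstream of `start`, excluding
# the base at `start`; a positive size fetches `size` bases starting at `start`.
upstream_seq = get_seq(3, 3678292, -2000)    # queries region "3:3676292:3678291"
downstream_seq = get_seq(3, 3678292, 2000)   # queries region "3:3678292:3680291"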
def _doc_read_file(self, relpath, op=False):
    abspath = os.path.join(self.path, relpath)
    with open(abspath, 'r') as source:
        page = source.read()
    # Wrap the raw HTML in a lightweight, page-like object built on the fly.
    page = type(
        'Documentation', (object, ), {
            'get_relative_source_path': (lambda x: x.save_as),
            'content': page,
            'title': BSHTML(page).find('title').getText(),
            'url': relpath if op else os.path.dirname(relpath),
            'save_as': relpath,
            'template': 'documentation'
        })()
    self.add_source_path(page)
    return page
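# A self-contained sketch of the same `type()` trick used above: build a class and a
# page-like instance on the fly (all names and values here are illustrative only).
Doc = type(
    'Documentation', (object, ), {
        'content': '<title>Hello</title>',
        'save_as': 'docs/hello.html',
        'get_relative_source_path': (lambda self: self.save_as),
    })
doc = Doc()
print(doc.get_relative_source_path())  # -> docs/hello.html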
# Python 2 code; assumes the surrounding module provides: os, re, tempfile, BSHTML,
# WEB_URL_REGEX, HTML_URL_REGEX, striprtf, log, and walk (e.g. email.iterators.walk).
def get_decoded_email_body(def_self, incident_id, eml_filename, mail):
    attachments = []
    urls = []
    text = ''  # Stores the email body text, HTML formatted, for the return value of this function.
    if mail.is_multipart():
        for part in list(walk(mail)):
            try:
                if part is None:
                    continue
                charset = part.get_content_charset()
                if (part.get('Content-Disposition') is not None) and part.get_filename() is not None:
                    # Likely a file attachment
                    if "attachment" in part.get('Content-Disposition').lower():
                        # File attachment
                        try:
                            filename = part.get_filename()  # The name of the file
                            content = part.get_payload(decode=True)  # The content of the file
                            text += '<br />[attachment: ' + filename + ']'  # Insert found attachment into the body text
                            # Here we temporarily store the attachment, and then post it to the incident as an attachment and artifact
                            with tempfile.NamedTemporaryFile(delete=False) as temp_file:
                                try:
                                    temp_file.write(content)
                                    temp_file.close()
                                    artifact_type_id = 16  # "Other File" artifact ID
                                    def_self.rest_client().post_artifact_file(
                                        '/incidents/{0}/artifacts/files'.format(incident_id),
                                        artifact_type_id,
                                        temp_file.name,
                                        description='Attachment from {0}'.format(eml_filename),
                                        value=filename)
                                    def_self.rest_client().post_attachment(
                                        '/incidents/{0}/attachments'.format(incident_id),
                                        temp_file.name,
                                        '[MALICIOUS] {0}'.format(filename))
                                    attachments.append(filename)
                                finally:
                                    os.unlink(temp_file.name)
                        except Exception:
                            pass  # Attachment handling is best-effort; skip parts that fail to post.
                elif part.get_payload(decode=True) is None:
                    continue
                elif part.get_content_charset() is None:
                    # We cannot know the character set, so return decoded "something" ...
                    text += unicode(
                        part.get_payload(decode=True), errors='replace'
                    ).encode('UTF-8', 'replace').strip()  # Trying this - may decide to remove later. -JJF, 2/23/2019
                    #continue
                elif part.get_content_type() == 'text/plain':
                    t = unicode(part.get_payload(decode=True), str(charset),
                                'replace').encode('UTF-8', 'replace').strip()
                    text += '<br />'.join(t.splitlines())  # To HTML
                    urls_temp = re.findall(WEB_URL_REGEX, text.strip())  # Find all URLs in body text
                    for u in urls_temp:
                        if u not in urls:
                            urls.append(u)  # If not already in urls list, add it
                elif part.get_content_type() == 'text/html':
                    t = unicode(part.get_payload(decode=True), str(charset),
                                'replace').encode('UTF-8', 'replace').strip()
                    text += str(t)
                    skip_image_urls = []
                    urls_html_temp = re.findall(HTML_URL_REGEX, text.strip())
                    # Could also try: [a.get('href') for a in soup.find_all('a', href=True)]
                    soup = BSHTML(text)
                    images = soup.findAll('img')  # Find img tag urls, to ensure we don't put image URLs into urls list
                    for image in images:
                        skip_image_urls.append(image['src'])
                    for u in urls_html_temp:
                        if (u not in urls) and (u not in skip_image_urls):
                            urls.append(u)  # If not already in urls list and not an image, add it
                elif part.get_content_type() == 'text/enriched':
                    # This has not been tested yet, no test cases available.
                    t = unicode(part.get_payload(decode=True), str(charset),
                                'replace').encode('UTF-8', 'replace').strip()
                    text += '<br />'.join(striprtf(t).splitlines())  # To HTML
                    urls_temp = re.findall(WEB_URL_REGEX, text.strip())  # Find all URLs in body text
                    for u in urls_temp:
                        if u not in urls:
                            urls.append(u)  # If not already in urls list, add it
            except Exception as err:
                log.info('[ERROR] Message body decoding failed at a part! Encountered: ' + str(err))  # For debugging unexpected situations, function is robust as-is though
        if text is not None and text != '':
            return [text.strip(), attachments, urls]
        else:
            return ['Unable to parse email body. Was it empty?', attachments, urls]
    else:
        t = unicode(mail.get_payload(decode=True), mail.get_content_charset(),
                    'replace').encode('UTF-8', 'replace')
        text = '<br />'.join(t.splitlines())  # To HTML
        urls_temp = re.findall(WEB_URL_REGEX, text.strip())  # Find all URLs in body text
        for u in urls_temp:
            if u not in urls:
                urls.append(u)  # If not already in urls list, add it
        return [text.strip(), attachments, urls]
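# A minimal, self-contained Python 3 sketch of the walk-and-decode pattern that the
# function above relies on (the function itself is Python 2 - note the `unicode` calls).
# The message built here is purely illustrative.
from email.mime.multipart import MIMEMultipart
from email.mime.text import MIMEText

msg = MIMEMultipart('alternative')
msg.attach(MIMEText('plain body with a link http://example.com', 'plain', 'utf-8'))
msg.attach(MIMEText('<p>html body</p>', 'html', 'utf-8'))

for part in msg.walk():
    if part.is_multipart():
        continue  # container parts carry no decodable payload of their own
    charset = part.get_content_charset() or 'utf-8'
    body = part.get_payload(decode=True).decode(charset, 'replace')
    print(part.get_content_type(), repr(body))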
def get_code(desc):
    bs = BSHTML(desc)
    return bs.code  # first <code> tag in the parsed HTML
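# Quick usage sketch (hypothetical HTML snippet; assumes BSHTML is a BeautifulSoup alias):
desc = '<p>Install it with:</p><code>pip install requests</code>'
code_tag = get_code(desc)
print(code_tag.string)  # -> pip install requests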
# Modified on 2 July 2019
# This simple script reads all emoji counts from emojitracker and saves
# them into a csv file. Simply save emojitracker.com as an html file and
# pass the file name to this parser.
#
# How to run: python parse.py
#
# Requirements: pip install BeautifulSoup
from BeautifulSoup import BeautifulSoup as BSHTML

INPUT_FILE = 'emojitracker-sample.html'
OUTPUT_FILE = 'output-sample.csv'
INPUT_FILE = 'emojitracker-2-july-2019.html'   # overrides the sample files above
OUTPUT_FILE = 'output.csv'

f = open(OUTPUT_FILE, 'w')
f.write('unicode\tname\tcount\n')  # write headers

with open(INPUT_FILE) as texts:
    soup = BSHTML(texts)
    lis = soup.findAll('li', attrs={'class': 'emoji_char'})
    for li in lis:
        emoji = li['id'].lower()
        name = li['data-title'].lower()
        count = li.find('span', attrs={'class': 'score'}).text
        f.write(emoji + '\t"' + name + '"\t' + count + '\n')  # write to file

f.close()