def get_unicode(file_path):
    with open(file_path, 'rb') as f:
        detection = chardet.detect(f.read())
    enc = detection["encoding"]
    if detection["encoding"] == "ascii":
        with open(file_path, encoding="ascii") as f:
            data = f.read()
    elif detection["encoding"] == "ISO-8859-9":
        # treat an ISO-8859-9 guess as UTF-8 and reopen accordingly
        with open(file_path, encoding="utf-8") as f:
            enc = "utf-8"
            data = f.read()
    else:
        try:
            # Try to open with the detected (possibly non-Unicode) encoding
            with open(file_path, encoding=detection["encoding"]) as f:
                data = f.read()
        except Exception as e:
            raise ValueError(f"Cannot return dictionary from empty or invalid csv file {file_path} due to {e}")
    if not data:
        raise ValueError(f"Cannot return dictionary from empty or invalid csv file {file_path}")
    return UnicodeDammit(data).unicode_markup, enc
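# The essential flow of get_unicode() above, reduced to a sketch; the file
# name here is an illustrative assumption.
import chardet
from bs4 import UnicodeDammit

with open('data.csv', 'rb') as f:
    raw = f.read()
enc = chardet.detect(raw)['encoding']       # e.g. 'ascii', 'utf-8', ...
text = UnicodeDammit(raw).unicode_markup    # decoded text, whatever the guess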
def parse_rsc_html(htmlstring):
    """Messy RSC HTML needs this special parser to fix problems before creating a selector."""
    converted = UnicodeDammit(htmlstring)
    if not converted.unicode_markup:
        # UnicodeDecodeError needs five arguments, so raise a ValueError instead
        raise ValueError('Failed to detect encoding')
    root = fromstring(htmlstring, parser=HTMLParser(recover=True, encoding=converted.original_encoding))
    # Add p.otherpara tags around orphan text
    newp = None
    for child in root.get_element_by_id('wrapper'):
        if newp is not None:
            if child.tag in BLOCK_ELEMENTS or child.get('id', '').startswith('sect') or child.getnext() is None:
                child.addprevious(newp)
                newp = None
            else:
                newp.append(child)
        if newp is None and child.tag in BLOCK_ELEMENTS and child.tail and child.tail.strip():
            newp = Element('p', **{'class': 'otherpara'})
            newp.text = child.tail
            child.tail = ''
    return root
def main(args):
    """Main function."""
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    path_source = os.path.realpath(args.json[0])
    fields = []
    files = 0
    for dir_name, sub_dir_list, file_list in os.walk(path_source):
        this_dir = os.path.basename(dir_name)
        try:
            dirname = unicode(this_dir)
        except UnicodeDecodeError:
            try:
                dirname = UnicodeDammit(this_dir).unicode_markup
            except UnicodeDecodeError:
                logger.warning('this directory name is unspeakable evil')
                dirname = u'[[[EVIL]]]'
        for file_name_json in file_list:
            files += 1
            with open(os.path.join(dir_name, file_name_json), 'r') as file_json:
                resource = json.load(file_json)
            for field in resource.keys():
                if field not in fields:
                    fields.append(field)
            pprint(resource)
            del resource
            if files % 250 == 0:
                logger.debug(u'parsed {0} files: {1} fields at {2}'.format(files, len(fields), dirname))
    for field in sorted(fields):
        print(field)
def get_proxies(n=5):
    """Read some notoriously known sites and extract some public proxies.

    Scrapes
        - http://www.samair.ru/proxy/

    The quality of these proxies is probably not worth mentioning, but it's
    nice to test the lack of quality and the behaviour of GoogleScraper.
    """
    r = requests.get('http://www.samair.ru/proxy/')
    # Try to parse the HTML result using lxml
    try:
        doc = UnicodeDammit(r.text, is_html=True)
        parser = lxml.html.HTMLParser(encoding=doc.declared_html_encoding)
        dom = lxml.html.document_fromstring(r.text, parser=parser)
        dom.resolve_base_href()
    except Exception as e:
        print('Some error occurred while lxml tried to parse: {}'.format(e))
        return None  # bail out; `dom` is not usable past this point
    table = dom.xpath('//table[@id=\'proxylist\']')[0]
    for row in table.findall('tr'):
        print(row.xpath('//td[1]')[0].text_content())
    return GoogleScraper.Proxy()
def unicode_dammit_example():
    # Install the 'chardet' or 'cchardet' Python libraries for better guesses

    ### Take a string with unknown encoding and make the string Unicode
    weirdass_string = "Sacr\xc3\xa9 bleu!"
    dammit = UnicodeDammit(weirdass_string)
    print "Original word with weird encoding:", weirdass_string
    print "Dammit print:", (dammit.unicode_markup)
    print "Dammit encoding:", (dammit.original_encoding)

    ### Take a doc with mostly UTF-8 encoding (and misc encodings due to mult
    # data sources) and convert to UTF-8 Unicode with UnicodeDammit.detwingle()
    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")

    # So now we have one doc with two encodings in it, printing is a mess
    #print "Weird decoding doc with utf8:", doc  # messed up, won't print
    #print (doc.decode("windows-1252"))  # So messed up it doesn't even print

    # UnicodeDammit.detwingle() converts the string to pure UTF-8
    new_doc = UnicodeDammit.detwingle(doc)
    print new_doc.decode("utf8")
def get_fileencoding(filename, default=None, detail=None):
    encoding = default
    skip_bytes = 0
    if os.path.isfile(filename):
        f = __builtin__.open(filename, "rb")
        try:
            s = f.read(2)
            """
            ANSI: no BOM;
            Unicode (UTF-16 LE): first two bytes are FF FE;
            Unicode big endian (UTF-16 BE): first two bytes are FE FF;
            UTF-8 with BOM: first three bytes are EF BB BF.
            """
            if s == chr(0xff) + chr(0xfe):
                encoding = "utf_16_le"
                skip_bytes = 2
            elif s == chr(0xfe) + chr(0xff):
                encoding = "utf_16_be"
                skip_bytes = 2
            elif s == chr(0xef) + chr(0xbb):
                encoding = "utf-8-sig"
                skip_bytes = 3
        except:
            pass
        if not encoding:
            # Fall back on BeautifulSoup's encoding detection
            f.seek(0)
            line = f.readline()
            dammit = UnicodeDammit(line)
            # Note: this sometimes reports 'windows-1252' (a Latin charset),
            # so the result is not fully reliable.
            encoding = dammit.original_encoding
        f.close()
    if isinstance(detail, dict):
        detail["encoding"] = encoding
        detail["skip_bytes"] = skip_bytes
    return encoding
def spiderImage(url):
    global urls
    global count
    req = urllib.request.Request(url, headers=header)  # masquerade as a browser
    data = urllib.request.urlopen(req)  # fetch the page
    data = data.read()  # read the response body
    dammit = UnicodeDammit(data, ['utf-8', 'gbk'])
    data = dammit.unicode_markup
    soup = BeautifulSoup(data, "html.parser")
    imgs = soup.select("img")
    for img in imgs:
        try:
            src = img["src"]
            url = urllib.parse.urljoin(start_url, src)
            if url not in urls:
                urls.append(url)
                print(url)
                T = threading.Thread(target=download, args=[url, count])
                T.setDaemon(False)
                T.start()
                threads.append(T)
                count = count + 1
        except Exception as err:
            print(err)
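# download(), header, start_url, urls, threads and count are defined elsewhere
# in the original script. A plausible sketch of the download worker, inferred
# from the Thread call above (name, signature and file layout are assumptions):
def download(url, count):
    try:
        req = urllib.request.Request(url, headers=header)
        data = urllib.request.urlopen(req).read()
        with open('%d.jpg' % count, 'wb') as f:  # output name/extension assumed
            f.write(data)
        print('downloaded ' + url)
    except Exception as err:
        print(err)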
def cleanse_href(href_str, base_url):
    """Sort out the different href parsing methods and generate a
    meaningful URL to follow."""
    ret_val = True
    try:
        # get rid of empties and whitespace
        href_str = href_str.strip()
    except AttributeError:
        ret_val = False
    # get rid of single-character hrefs, typically '#'
    if ret_val and len(href_str) > 1:
        ret_val = href_str
    else:
        ret_val = False
    # convert to unicode
    if ret_val:
        href_str_unicode = UnicodeDammit(href_str)
        href_str = href_str_unicode.unicode_markup
    # domain specific
    if ret_val and base_url == "http://www.irishtimes.com":
        # The Irish Times puts a counter or a version number at the end of
        # their article pages, like 1.255698, so quick regexp. Also, put the
        # top domain back to deliver a full URL for the Irish Times.
        if re.search(r'\.[0-9]{3,5}', href_str):
            ret_val = TOP_DOMAIN + href_str
        else:
            ret_val = False
    return ret_val
def parse(self, response):
    """Default parse method; the rule is not useful now."""
    time.sleep(uniform(1, 10))
    print response.url
    # response = response.replace(url=HtmlParser.remove_url_parameter(response.url))
    hxs = HtmlXPathSelector(response)
    index_level = self.determine_level(response)
    if index_level == 1:
        relative_urls = self.get_top_profile(2, hxs)
        if relative_urls is not None:
            for url in relative_urls:
                yield Request(url, callback=self.parse)
    elif index_level == 2:
        personProfile = HtmlParser.extract_person_profile(hxs)
        linkedin_id = self.get_linkedin_id(response.url)
        linkedin_id = UnicodeDammit(urllib.unquote_plus(linkedin_id)).markup
        if linkedin_id:
            personProfile['id'] = linkedin_id
            # personProfile['url'] = UnicodeDammit(response.url).markup
            self.mongodb_linkedin.rel_coll.update({'linkedin': response.url}, {'$set': dict(personProfile)})
            print personProfile
            yield personProfile
def _poll_now(self, action_result, param):
    """Poll data."""
    max_containers = param[phantom.APP_JSON_CONTAINER_COUNT]
    disable_max_containers = self.get_config().get('max_containers')
    single_page = False
    paging_data = {
        "page_cnt": 1,
        "alerts_per_page": 50,
        "total_pages": None
    }
    self.save_progress("start_time:{0}".format(param[phantom.APP_JSON_START_TIME]))

    # Convert from epoch to ISO 8601 format
    dt_start = datetime.datetime.utcfromtimestamp(param[phantom.APP_JSON_START_TIME] / 1000)
    dt_start_formatted = datetime.datetime.strftime(dt_start, "%Y-%m-%dT%H:%M:%S")
    self.save_progress("Fetching alerts from {0} to now".format(dt_start_formatted))
    filter_value = ARBORSIGHTLINE_GET_ALERTS_FILTER.format(time=dt_start_formatted)

    # Percent-encode our filter query.
    filter_value = urllib.quote(filter_value, safe='')

    # Add query params
    filter_param = "filter={0}".format(filter_value)
    other_param = "include=annotations"
    params = [filter_param, other_param]

    # Filtering the amount of results per page
    if not disable_max_containers and max_containers < paging_data['alerts_per_page']:
        paging_data['alerts_per_page'] = max_containers
        page_param = "perPage={0}".format(paging_data['alerts_per_page'])
        params.append(page_param)
        single_page = True

    url = "{0}?{1}".format(ARBORSIGHTLINE_GET_ALERTS_ENDPOINT, "&".join(params))
    self.save_progress("Url={0}".format(url))

    # Fetch alerts
    ret_val, response = self._get_alerts(action_result, url, paging_data)
    if phantom.is_fail(ret_val):
        try:
            self.error_print(action_result.get_status_message())
            self.save_progress(action_result.get_status_message())
        except:
            self.error_print(ARBORSIGHTLINE_GET_ALERTS_FAILED_MSG)
            self.save_progress(ARBORSIGHTLINE_GET_ALERTS_FAILED_MSG)
        return action_result.get_status()

    # Parse returned alerts
    ret_val, total_alerts = self._parse_alerts(action_result, response)
    if phantom.is_fail(ret_val):
        try:
            self.error_print(action_result.get_status_message())
            self.save_progress(action_result.get_status_message())
        except:
            self.error_print(ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG)
            self.save_progress(ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG)
        return action_result.get_status()

    # Handle case of no alerts found
    if total_alerts < 1:
        self.save_progress(ARBORSIGHTLINE_GET_ALERTS_EMPTY_MSG)
        action_result.set_status(phantom.APP_SUCCESS, ARBORSIGHTLINE_GET_ALERTS_EMPTY_MSG)
        return action_result.get_status()

    # Handle paging to fetch next alerts
    try:
        if not single_page:
            last_page_link = urllib.unquote(response['links']['last']).replace("&amp;", "&")
            paging_data['total_pages'] = int(
                urlparse.parse_qs(urlparse.urlparse(last_page_link).query)['page'][0])
            paging_data['page_cnt'] += 1
            while paging_data['page_cnt'] <= paging_data['total_pages']:
                # Exit strategy with max containers
                if not disable_max_containers:
                    remaining_alerts = max_containers - total_alerts
                    if remaining_alerts <= 0:
                        self.save_progress("Maximum amount of containers reached: leaving..")
                        break
                page_param = "page={0}".format(paging_data['page_cnt'])
                params = [filter_param, other_param, page_param]
                url = "{0}?{1}".format(ARBORSIGHTLINE_GET_ALERTS_ENDPOINT, "&".join(params))
                ret_val, response = self._get_alerts(action_result, url, paging_data)
                if phantom.is_fail(ret_val):
                    try:
                        self.error_print(action_result.get_status_message())
                        self.save_progress(action_result.get_status_message())
                    except:
                        self.error_print(ARBORSIGHTLINE_GET_ALERTS_FAILED_MSG)
                        self.save_progress(ARBORSIGHTLINE_GET_ALERTS_FAILED_MSG)
                    return action_result.get_status()
                # Eventually reduce amount of alerts to speed up processing
                if not disable_max_containers and remaining_alerts < paging_data['alerts_per_page']:
                    response['data'] = response['data'][:remaining_alerts]
                ret_val, page_alerts = self._parse_alerts(action_result, response)
                if phantom.is_fail(ret_val):
                    try:
                        self.error_print(action_result.get_status_message())
                        self.save_progress(action_result.get_status_message())
                    except:
                        self.error_print(ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG)
                        self.save_progress(ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG)
                    return action_result.get_status()
                # Update counters
                paging_data['page_cnt'] += 1
                total_alerts += page_alerts
    except Exception as e:
        try:
            if e.message:
                error_msg = UnicodeDammit(e.message).unicode_markup.encode('UTF-8')
            else:
                error_msg = "Error message unavailable"
        except:
            error_msg = "Unable to parse error message"
        return action_result.set_status(
            phantom.APP_ERROR,
            '{}. Error message: {}'.format(ARBORSIGHTLINE_GET_ALERTS_PAGINATION_FAILED_MSG, error_msg))

    # Save checkpoint
    self._state['last_ingested_epoch'] = param[phantom.APP_JSON_END_TIME]
    self.debug_print("Got new checkpoint: {}".format(self._state['last_ingested_epoch']))
    return action_result.set_status(phantom.APP_SUCCESS)
def detect_encoding(html_content):
    # TODO: make a better version which does not ignore Content-Type
    # http://stackoverflow.com/questions/2686709/encoding-in-python-with-lxml-complex-solution
    from bs4 import UnicodeDammit
    ud = UnicodeDammit(html_content, is_html=True)
    return ud.original_encoding
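# A minimal usage sketch for detect_encoding(); the sample bytes are
# illustrative, not from the original source.
raw = u'<html><body>Sacr\u00e9 bleu!</body></html>'.encode('latin-1')
print(detect_encoding(raw))  # a Latin-1-compatible guess, e.g. 'windows-1252'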
def save_subtitles(video, subtitles, single=False, directory=None, encoding=None, encode_with=None,
                   chmod=None, forced_tag=False, path_decoder=None):
    """Save subtitles on filesystem.

    Subtitles are saved in the order of the list. If a subtitle with a language has already been saved, other
    subtitles with the same language are silently ignored.

    The extension used is `.lang.srt` by default or `.srt` if `single` is `True`, with `lang` being the IETF code for
    the :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle.

    :param video: video of the subtitles.
    :type video: :class:`~subliminal.video.Video`
    :param subtitles: subtitles to save.
    :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
    :param bool single: save a single subtitle, default is to save one subtitle per language.
    :param str directory: path to directory where to save the subtitles, default is next to the video.
    :param str encoding: encoding in which to save the subtitles, default is to keep original encoding.
    :return: the saved subtitles
    :rtype: list of :class:`~subliminal.subtitle.Subtitle`

    patch: unicode path problems
    """
    saved_subtitles = []
    for subtitle in subtitles:
        # check content
        if subtitle.content is None:
            logger.error('Skipping subtitle %r: no content', subtitle)
            continue
        # check language
        if subtitle.language in set(s.language for s in saved_subtitles):
            logger.debug('Skipping subtitle %r: language already saved', subtitle)
            continue
        # create subtitle path
        subtitle_path = get_subtitle_path(video.name, None if single else subtitle.language,
                                          forced_tag=forced_tag)
        if directory is not None:
            subtitle_path = os.path.join(directory, os.path.split(subtitle_path)[1])
        if path_decoder:
            subtitle_path = path_decoder(subtitle_path)
        # force unicode
        subtitle_path = UnicodeDammit(subtitle_path).unicode_markup
        subtitle.storage_path = subtitle_path
        # save content as is or in the specified encoding
        logger.info('Saving %r to %r', subtitle, subtitle_path)
        has_encoder = callable(encode_with)
        if has_encoder:
            logger.info('Using encoder %s' % encode_with.__name__)
        # save normalized subtitle if encoder or no encoding is given
        if has_encoder or encoding is None:
            content = encode_with(subtitle.text) if has_encoder else subtitle.content
            with io.open(subtitle_path, 'wb') as f:
                f.write(content)
            # change chmod if requested
            if chmod:
                os.chmod(subtitle_path, chmod)
            if single:
                break
            continue
        # save subtitle if encoding given
        if encoding is not None:
            with io.open(subtitle_path, 'w', encoding=encoding) as f:
                f.write(subtitle.text)
            # change chmod if requested
            if chmod:
                os.chmod(subtitle_path, chmod)
        saved_subtitles.append(subtitle)
        # check single
        if single:
            break
    return saved_subtitles
artist = input("Enter the artist name: ")

# calculating time taken in searching lyrics
a = datetime.datetime.now()

url_data = stripper(song, artist)  # generate url path using stripper()
url = 'https://genius.com/{}-lyrics'.format(url_data)  # format the url with the url path
page = requests.get(url)
if page.status_code != 200:
    url_data = requests.get('https://aadibajpai.pythonanywhere.com/stripper',
                            data={
                                'song': song,
                                'artist': artist
                            }).text
    url = 'https://genius.com/{}-lyrics'.format(url_data)
    page = requests.get(url)

html = BeautifulSoup(page.text, "html.parser")  # TODO: Add error handling
lyrics_path = html.find("div", class_="lyrics")  # finding div on Genius containing the lyrics
if lyrics_path is None:
    lyrics = 'Couldn\'t get lyrics for {song} by {artist}.\n'.format(song=song, artist=artist)
else:
    lyrics = UnicodeDammit(lyrics_path.get_text().strip()).unicode_markup
print(lyrics)

b = datetime.datetime.now()
delta = b - a
print('Time taken in milliseconds: ', delta.total_seconds() * 1000)
def _toUnicode(value):
    """Convert an unknown string to unicode."""
    if not isinstance(value, unicode):
        value = UnicodeDammit(value).unicode_markup
    return value
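# A minimal sketch of _toUnicode() in use (Python 2 semantics, since it tests
# against `unicode`); the sample bytes are illustrative.
print _toUnicode('Sacr\xc3\xa9 bleu!')   # UTF-8 bytes -> u'Sacr\xe9 bleu!'
print _toUnicode(u'already unicode')     # returned unchanged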
def save_subtitles(video, subtitles, single=False, directory=None, chmod=None, formats=("srt",),
                   forced_tag=False, path_decoder=None, debug_mods=False):
    """Save subtitles on filesystem.

    Subtitles are saved in the order of the list. If a subtitle with a language has already been saved, other
    subtitles with the same language are silently ignored.

    The extension used is `.lang.srt` by default or `.srt` if `single` is `True`, with `lang` being the IETF code for
    the :attr:`~subliminal.subtitle.Subtitle.language` of the subtitle.

    :param formats: list of "srt" and "vtt"
    :param video: video of the subtitles.
    :type video: :class:`~subliminal.video.Video`
    :param subtitles: subtitles to save.
    :type subtitles: list of :class:`~subliminal.subtitle.Subtitle`
    :param bool single: save a single subtitle, default is to save one subtitle per language.
    :param str directory: path to directory where to save the subtitles, default is next to the video.
    :return: the saved subtitles
    :rtype: list of :class:`~subliminal.subtitle.Subtitle`

    patch: unicode path problems
    """
    logger.debug("Subtitle formats requested: %r", formats)
    saved_subtitles = []
    for subtitle in subtitles:
        # check content
        if subtitle.content is None:
            logger.error('Skipping subtitle %r: no content', subtitle)
            continue
        # check language
        if subtitle.language in set(s.language for s in saved_subtitles):
            logger.debug('Skipping subtitle %r: language already saved', subtitle)
            continue
        # create subtitle path
        subtitle_path = get_subtitle_path(video.name, None if single else subtitle.language,
                                          forced_tag=forced_tag)
        if directory is not None:
            subtitle_path = os.path.join(directory, os.path.split(subtitle_path)[1])
        if path_decoder:
            subtitle_path = path_decoder(subtitle_path)
        # force unicode
        subtitle_path = UnicodeDammit(subtitle_path).unicode_markup
        subtitle.storage_path = subtitle_path
        for format in formats:
            if format != "srt":
                subtitle_path = os.path.splitext(subtitle_path)[0] + (u".%s" % format)
            logger.debug(u"Saving %r to %r", subtitle, subtitle_path)
            content = subtitle.get_modified_content(format=format)
            if content:
                with open(subtitle_path, 'w') as f:
                    f.write(content)
            else:
                logger.error(u"Something went wrong when getting modified subtitle for %s", subtitle)
            # change chmod if requested
            if chmod:
                os.chmod(subtitle_path, chmod)
        saved_subtitles.append(subtitle)
        # check single
        if single:
            break
    return saved_subtitles
def handle_text(self):
    '''
    Takes care of converting body text to unicode, if it's text at all.
    Sets self.originalEncoding to the original char encoding, and converts
    body to unicode if possible. Must come after handle_compression, and
    after self.mediaType is valid.
    '''
    self.encoding = None
    # if the body is text
    if (self.mediaType and
            (self.mediaType.type == 'text' or
             (self.mediaType.type == 'application' and 'xml' in self.mediaType.subtype))):
        # if there was a charset parameter in the HTTP header, store it
        if 'charset' in self.mediaType.params:
            override_encodings = [self.mediaType.params['charset']]
        else:
            override_encodings = []
        # if there even is data (otherwise, dammit.originalEncoding might be None)
        if self.body != '':
            if UnicodeDammit:
                # honestly, I don't mind not abiding by RFC 2023.
                # UnicodeDammit just does what makes sense, and if the
                # content is remotely standards-compliant, it will do the
                # right thing.
                dammit = UnicodeDammit(self.body, override_encodings)
                self.text = dammit.unicode_markup
                self.originalEncoding = dammit.original_encoding
                # if unicode was found
                #if dammit.unicode:
                #    self.text = dammit.unicode
                #    self.originalEncoding = dammit.originalEncoding
                #else:
                #    # unicode could not be decoded, at all
                #    # HAR can't write data, but body might still
                #    # be useful as-is
                #    pass
            else:
                # try the stupid version: just guess content-type or utf-8
                u = None
                # try our list of encodings + utf8 with strict errors
                for e in override_encodings + ['utf8', 'iso-8859-1']:
                    try:
                        u = self.body.decode(e, 'strict')
                        self.originalEncoding = e
                        break  # if ^^ didn't throw, we're done
                    except UnicodeError:
                        pass
                # if none of those worked, try utf8 with 'replace' error mode
                if not u:
                    # unicode has failed
                    u = self.body.decode('utf8', 'replace')
                    self.originalEncoding = None  # ???
                self.text = u or None
    else:
        # body is not text: base64 encode it and set self.encoding
        # TODO: check with list that this is right
        self.text = b64encode(self.body)
        self.encoding = 'base64'
def beautify(self, data, charset):
    dammit = UnicodeDammit(data, [charset, "utf-8", "gb2312", "gbk", "big5", "gb18030"],
                           smart_quotes_to="html")
    data = dammit.unicode_markup
    return data
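# A sketch of beautify()'s candidate-encoding behaviour: these bytes fail a
# strict UTF-8 decode, so UnicodeDammit falls through to 'gbk' in the override
# list (the sample bytes are an illustrative assumption).
raw = u'\u4f60\u597d'.encode('gbk')  # "ni hao" as GBK bytes
print(UnicodeDammit(raw, ['utf-8', 'gbk']).unicode_markup)  # -> u'\u4f60\u597d'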
def parse(url, path, name):
    # init variables
    spaces_regex = re.compile("^(\s*).*")
    location_regex = re.compile("^\s*(INT\.|EXT\.)")
    BLOCK_TYPES = ['character', 'speech', 'stage direction', 'location', 'unknown']
    CHARACTER = 0
    SPEECH = 1
    DIRECTIONS = 2
    LOCATION = 3
    time_start = time.time()

    if url.endswith('.pdf'):
        print('The file @ %s is a PDF' % (url))
        return

    script_text, soup = get_script(url)

    # write raw file
    if not os.path.exists(path + 'raw/'):
        os.makedirs(path + 'raw/')
    with open(path + 'raw/' + "%s.txt" % name, "w") as text_file:
        text_file.write(str(script_text))

    space_vector, character_presence = white_space_analysis(script_text, soup)
    usual_spaces, flag = identify_usual_spaces(space_vector, character_presence)

    # Define the variables we will fill with text
    is_intro = True
    movie_script = []
    intro = []
    last_line_type = -1
    last_character = 'unknown'
    text = []
    characters = []

    for block in script_text.descendants:
        # If block is an instance of bs4.Tag, it is wrapped in HTML tags.
        # The next block will contain the same text without the tags,
        # so skip this one without parsing it.
        if isinstance(block, Tag):
            continue
        # UnicodeDammit converts any string to UTF-8 (does not work so well)
        block = UnicodeDammit(block, soup.original_encoding).unicode_markup
        # remove leading and ending end of lines
        block = block.strip('\n').strip('\n\r')
        # if the block doesn't have any text, skip it
        if re.search('\w', block) is None:
            continue
        for line in block.split('\n'):
            stripped_line = line.strip(' \n\t\r')
            if re.search('\w', line) is None:
                continue
            # Counting the number of spaces at the beginning of the line
            spmatch = spaces_regex.search(line)
            space_vector.append(len(spmatch.group(1)))
            line_type = get_line_type(line, stripped_line, usual_spaces)
            if (last_line_type == -1  # -1 = not initialized
                    or last_line_type == line_type):
                text.append(stripped_line)
            else:
                if last_line_type == CHARACTER:
                    # regex to suppress (parentheses) & replicate speaker
                    last_character = '\n'.join(text)
                    if not last_character in characters:
                        characters.append(last_character)
                elif last_line_type == SPEECH:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        BLOCK_TYPES[CHARACTER]: last_character,
                        'text': '\n'.join(text)
                    })
                else:
                    movie_script.append({
                        'type': BLOCK_TYPES[last_line_type],
                        'text': '\n'.join(text)
                    })
                text = [stripped_line]
                last_line_type = line_type

    result = json_normalize(movie_script)
    if flag:
        write_csv(result, name, path)
        print(' Done parsing script at %s in %s' % (url, time.time() - time_start))
        print('-----------------')
        return result
    else:
        path = path + 'doubtful/'
        write_csv(result, name, path)
        print(' Done parsing script at %s in %s' % (url, time.time() - time_start))
        print('-----------------')
        return result
if __name__ == "__main__":
    file_list = os.listdir()
    if file_list.count('chs') == 0:
        os.mkdir('./chs')
    file_list = [
        x for x in file_list
        if os.path.isfile(x) and (os.path.splitext(x)[1] == '.ass' or os.path.splitext(x)[1] == '.srt')
    ]
    print(file_list)
    for i in file_list:
        # read the raw bytes
        with open(i, 'rb') as b:
            buf = b.read()
        # result = chardet.detect(buf)
        # detect the encoding
        result2 = UnicodeDammit(buf)
        print('Encoding: ', result2.original_encoding)
        with open(i, 'r', encoding=result2.original_encoding, errors='ignore') as text:
            text_all = text.read()
            # print(text_all)
        with open(os.path.join('./chs', i), 'w', encoding=result2.original_encoding, errors='ignore') as text:
            text.write(Traditional2Simplified(text_all))
        print(i, ' is converted')
    print('Success!')
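# Traditional2Simplified() is not shown above; a plausible sketch using the
# OpenCC package (an assumption -- the original may use a different converter):
from opencc import OpenCC

def Traditional2Simplified(text):
    # 't2s' converts Traditional Chinese characters to Simplified Chinese
    return OpenCC('t2s').convert(text)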
# -*- coding:utf-8 -*-
import re  # used below; may also be re-exported by const_var

from const_var import *
import downloader
import html2text
from bs4 import UnicodeDammit


def html_to_txt(html):
    h = html2text.HTML2Text()
    h.ignore_links = True
    return h.handle(html)


# ---|---|---|---
if __name__ == '__main__':
    url = "/webfile/jiaozuo/cgxx/jggg/webinfo/2017/07/1498553692194150.htm"
    # url = "/webfile/luoyang/zgxx/jggg/webinfo/2017/07/1498553690093831.htm"
    url = "%s%s" % (mainHTTP, url)
    html = downloader.Downloader()(url)
    dommit = UnicodeDammit(html)
    text = html_to_txt(dommit.unicode_markup)
    if re.search(r'-+\|+', text, re.DOTALL):
        print True
        # print text
def decode(str_, is_html=False, errors='strict'):
    if isinstance(str_, unicode):
        return str_
    if isinstance(str_, str):
        return UnicodeDammit(str_, ['utf-8'], is_html=is_html).unicode_markup
    return unicode(str_, 'utf-8', errors)
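# A minimal sketch of decode() in use (Python 2 semantics, given the
# `unicode`/`str` checks); the sample bytes are illustrative.
print decode('Sacr\xc3\xa9 bleu!')   # UTF-8 bytes -> u'Sacr\xe9 bleu!'
print decode(u'already unicode')     # returned unchanged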
def load_from_html(filename, use_boilerpipe=True, use_nltk=True, use_regex=True, binary=False,
                   field=['title', 'body']):
    if binary:
        charset = UnicodeDammit(open(filename, 'rb').read())
        charset = charset.original_encoding
        try:
            content = open(filename, 'r', encoding=charset).read()
        except Exception as e:
            # if there is an error, return empty results
            logging.warn('encode error: {}, {}'.format(filename, e))
            return {'title': [], 'body': []}
    else:
        content = open(filename, 'r', encoding='utf-8').read()
    start = time.time()
    if not use_regex or not use_boilerpipe:
        bs = BeautifulSoup(content, 'html.parser')
    if 'title' in field:
        if use_regex:
            match = re.search(r'<title.*?>(.+?)</title>', content[:5000], re.DOTALL | re.IGNORECASE)
            title = match.group(1) if match else ''
            title = html.unescape(title).strip()
        else:
            if bs.title is not None and bs.title.string is not None:
                title = bs.title.string.strip()
            else:
                title = ''
    t1 = time.time() - start
    start = time.time()
    if 'body' in field:
        if use_boilerpipe:
            extractor = Extractor(extractor='ArticleExtractor', html=content)  # time consuming
            body = extractor.getText()
        else:
            body = bs.select('body')
            if len(body) <= 0:
                body = bs
            else:
                body = body[0]
            # remove all useless tags
            [x.extract() for x in body.findAll('script')]
            [x.extract() for x in body.findAll('style')]
            [x.extract() for x in body.findAll('meta')]
            [x.extract() for x in body.findAll('link')]
            body = body.text
    t2 = time.time() - start
    start = time.time()
    result = {}
    if 'title' in field:
        result['title'] = my_word_tokenize(title) if use_nltk else clean_text(title).split(' ')
    if 'body' in field:
        result['body'] = my_word_tokenize(body) if use_nltk else clean_text(body).split(' ')
    t3 = time.time() - start
    # print('{}\t{}\t{}'.format(t1, t2, t3))
    return result
def importFitFromFiles(paths, iportuser=None):
    """
    Imports fits from file(s). First processes all provided paths and stores
    assembled fits into a list. This allows us to call back to the GUI as fits
    are processed as well as when fits are being saved.

    Returns a (success, fit_list-or-error-message) tuple.
    """
    sFit = svcFit.getInstance()
    fit_list = []
    try:
        for path in paths:
            if iportuser:  # Pulse
                msg = "Processing file:\n%s" % path
                pyfalog.debug(msg)
                processing_notify(iportuser, IPortUser.PROCESS_IMPORT | IPortUser.ID_UPDATE, msg)
                # wx.CallAfter(callback, 1, msg)
            with open(path, "rb") as file_:
                srcString = file_.read()
            dammit = UnicodeDammit(srcString)
            srcString = dammit.unicode_markup
            if len(srcString) == 0:  # ignore blank files
                pyfalog.debug("File is blank.")
                continue
            try:
                _, fitsImport = Port.importAuto(srcString, path, iportuser=iportuser)
                fit_list += fitsImport
            except xml.parsers.expat.ExpatError:
                pyfalog.warning("Malformed XML in:\n{0}", path)
                return False, "Malformed XML in %s" % path
        # IDs = []  # NOTE: what use for IDs?
        numFits = len(fit_list)
        for idx, fit in enumerate(fit_list):
            # Set some more fit attributes and save
            fit.character = sFit.character
            fit.damagePattern = sFit.pattern
            fit.targetResists = sFit.targetResists
            if len(fit.implants) > 0:
                fit.implantLocation = ImplantLocation.FIT
            else:
                useCharImplants = sFit.serviceFittingOptions["useCharacterImplantsByDefault"]
                fit.implantLocation = ImplantLocation.CHARACTER if useCharImplants else ImplantLocation.FIT
            db.save(fit)
            # IDs.append(fit.ID)
            if iportuser:  # Pulse
                pyfalog.debug("Processing complete, saving fits to database: {0}/{1}", idx + 1, numFits)
                processing_notify(
                    iportuser, IPortUser.PROCESS_IMPORT | IPortUser.ID_UPDATE,
                    "Processing complete, saving fits to database\n(%d/%d) %s" % (idx + 1, numFits, fit.ship.name))
    except UserCancelException:
        return False, "Processing has been canceled.\n"
    except Exception as e:
        pyfalog.critical("Unknown exception processing: {0}", path)
        pyfalog.critical(e)
        # TypeError: not all arguments converted during string formatting
        # return False, "Unknown Error while processing {0}" % path
        return False, "Unknown error while processing %s\n\n Error: %s" % (path, e.message)
    return True, fit_list
def feed(self, data):
    sgmllib.SGMLParser.feed(self, UnicodeDammit(data).unicode_markup)
def guess_encoding(self):
    """Guess encoding using the language, falling back on chardet.

    :return: the guessed encoding.
    :rtype: str
    """
    if self._guessed_encoding:
        return self._guessed_encoding

    logger.info('Guessing encoding for language %s', self.language)

    encodings = ['utf-8']

    # add language-specific encodings
    # http://scratchpad.wikia.com/wiki/Character_Encoding_Recommendation_for_Languages
    if self.language.alpha3 == 'zho':
        encodings.extend(['cp936', 'gb2312', 'cp950', 'gb18030', 'big5', 'big5hkscs'])
    elif self.language.alpha3 == 'jpn':
        encodings.extend(['shift-jis', 'cp932', 'euc_jp', 'iso2022_jp',
                          'iso2022_jp_1', 'iso2022_jp_2', 'iso2022_jp_2004',
                          'iso2022_jp_3', 'iso2022_jp_ext'])
    elif self.language.alpha3 == 'tha':
        encodings.extend(['tis-620', 'cp874'])
    # Arabic/Farsi
    elif self.language.alpha3 in ('ara', 'fas', 'per'):
        encodings.append('windows-1256')
    elif self.language.alpha3 == 'heb':
        encodings.extend(['windows-1255', 'iso-8859-8'])
    elif self.language.alpha3 == 'tur':
        encodings.extend(['windows-1254', 'iso-8859-9', 'iso-8859-3'])
    # Greek
    elif self.language.alpha3 in ('grc', 'gre', 'ell'):
        encodings.extend(['windows-1253', 'cp1253', 'cp737', 'iso8859-7',
                          'cp875', 'cp869', 'iso2022_jp_2', 'mac_greek'])
    # Polish, Czech, Slovak, Hungarian, Slovene, Bosnian, Croatian, Serbian (Latin script),
    # Romanian and Albanian
    elif self.language.alpha3 in ('pol', 'cze', 'ces', 'slk', 'slo', 'slv', 'hun',
                                  'bos', 'hbs', 'hrv', 'rsb', 'ron', 'rum', 'sqi', 'alb'):
        encodings.extend(['windows-1250', 'iso-8859-2'])
        # Eastern European Group 1
        if self.language.alpha3 == "slv":
            encodings.append('iso-8859-4')
        # Albanian
        elif self.language.alpha3 in ("sqi", "alb"):
            encodings.extend(['windows-1252', 'iso-8859-15', 'iso-8859-1', 'iso-8859-9'])
    # Bulgarian, Serbian, Macedonian, Ukrainian and Russian
    elif self.language.alpha3 in ('bul', 'srp', 'mkd', 'mac', 'rus', 'ukr'):
        # Eastern European Group 2
        if self.language.alpha3 in ('bul', 'mkd', 'mac', 'rus', 'ukr'):
            encodings.extend(['windows-1251', 'iso-8859-5'])
        elif self.language.alpha3 == 'srp':
            if self.language.script == "Latn":
                encodings.extend(['windows-1250', 'iso-8859-2'])
            elif self.language.script == "Cyrl":
                encodings.extend(['windows-1251', 'iso-8859-5'])
            else:
                encodings.extend(['windows-1250', 'windows-1251', 'iso-8859-2', 'iso-8859-5'])
    else:
        # Western European (windows-1252) / Northern European
        encodings.extend(['latin-1', 'iso-8859-15', 'iso-8859-9', 'iso-8859-4', 'iso-8859-1'])

    # try to decode
    logger.debug('Trying encodings %r', encodings)
    for encoding in encodings:
        try:
            self.content.decode(encoding)
        except UnicodeDecodeError:
            pass
        else:
            logger.info('Guessed encoding %s', encoding)
            self._guessed_encoding = encoding
            return encoding

    logger.warning('Could not guess encoding from language')

    # fallback on chardet
    encoding = chardet.detect(self.content)['encoding']
    logger.info('Chardet found encoding %s', encoding)

    if not encoding:
        # fallback on bs4
        logger.info('Falling back to bs4 detection')
        a = UnicodeDammit(self.content)
        Log.Debug("bs4 detected encoding: %s" % a.original_encoding)
        if a.original_encoding:
            self._guessed_encoding = a.original_encoding
            return a.original_encoding
        raise ValueError(u"Couldn't guess the proper encoding for %s" % self)

    self._guessed_encoding = encoding
    return encoding
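# The core pattern of guess_encoding(), reduced to a standalone sketch:
# strict-decode a list of candidates, then fall back to chardet (the
# candidate list here is illustrative).
import chardet

def guess(content, candidates=('utf-8', 'windows-1252')):
    for enc in candidates:
        try:
            content.decode(enc)
            return enc
        except UnicodeDecodeError:
            continue
    return chardet.detect(content)['encoding']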
def main(argv=None):
    """
    Handles command line arguments and gets things started.

    Parameters
    ----------
    argv : list of str
        List of arguments, as if specified on the command-line.
        If None, ``sys.argv[1:]`` is used instead.
    """
    # Get command line arguments
    parser = argparse.ArgumentParser(
        description="Takes an input feature file and converts it to another \
                     format. Formats are determined automatically from file \
                     extensions.",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('infile',
                        help='input feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('outfile',
                        help='output feature file (ends in .arff, .csv, \
                              .jsonlines, .libsvm, .megam, .ndj, or .tsv)')
    parser.add_argument('-i', '--id_col',
                        help='Name of the column which contains the instance \
                              IDs in ARFF, CSV, or TSV files.',
                        default='id')
    parser.add_argument('-l', '--label_col',
                        help='Name of the column which contains the class \
                              labels in ARFF, CSV, or TSV files. For ARFF \
                              files, this must be the final column to count as\
                              the label.',
                        default='y')
    parser.add_argument('-q', '--quiet',
                        help='Suppress printing of "Loading..." messages.',
                        action='store_true')
    parser.add_argument('--arff_regression',
                        help='Create ARFF files for regression, not \
                              classification.',
                        action='store_true')
    parser.add_argument('--arff_relation',
                        help='Relation name to use for ARFF file.',
                        default='skll_relation')
    parser.add_argument('--reuse_libsvm_map',
                        help='If you want to output multiple files that use \
                              the same mapping from labels and features to \
                              numbers when writing libsvm files, you can \
                              specify an existing .libsvm file to reuse the \
                              mapping from.',
                        type=argparse.FileType('rb'))
    parser.add_argument('--version', action='version',
                        version='%(prog)s {0}'.format(__version__))
    args = parser.parse_args(argv)

    # Make warnings from built-in warnings module get formatted more nicely
    logging.captureWarnings(True)
    logging.basicConfig(format=('%(asctime)s - %(name)s - %(levelname)s - '
                                '%(message)s'))
    logger = logging.getLogger(__name__)

    # make sure the input file extension is one we can process
    input_extension = os.path.splitext(args.infile)[1].lower()
    output_extension = os.path.splitext(args.outfile)[1].lower()

    if input_extension not in EXT_TO_READER:
        logger.error(('Input file must be in either .arff, .csv, .jsonlines, '
                      '.libsvm, .megam, .ndj, or .tsv format. You specified: '
                      '{}').format(input_extension))
        sys.exit(1)

    # Build feature and label vectorizers from existing libsvm file if asked
    if args.reuse_libsvm_map and output_extension == '.libsvm':
        feat_map = {}
        label_map = {}
        for line in args.reuse_libsvm_map:
            line = UnicodeDammit(line, ['utf-8', 'windows-1252']).unicode_markup
            if '#' not in line:
                logger.error('The LibSVM file you want to reuse the map from '
                             'was not created by SKLL and does not actually '
                             'contain the necessary mapping info.')
                sys.exit(1)
            comments = line.split('#')[1]
            _, label_map_str, feat_map_str = comments.split('|')
            feat_map.update(_pair_to_dict_tuple(pair) for pair in feat_map_str.strip().split())
            label_map.update(_pair_to_dict_tuple(pair) for pair in label_map_str.strip().split())
        feat_vectorizer = DictVectorizer()
        feat_vectorizer.fit([{name: 1} for name in feat_map])
        feat_vectorizer.vocabulary_ = feat_map
    else:
        feat_vectorizer = None
        label_map = None

    # Iterate through input file and collect the information we need
    reader = EXT_TO_READER[input_extension](args.infile, quiet=args.quiet,
                                            label_col=args.label_col,
                                            id_col=args.id_col)
    feature_set = reader.read()

    # write out the file in the requested output format
    writer_type = EXT_TO_WRITER[output_extension]
    writer_args = {'quiet': args.quiet}
    if writer_type is DelimitedFileWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
    elif writer_type is ARFFWriter:
        writer_args['label_col'] = args.label_col
        writer_args['id_col'] = args.id_col
        writer_args['regression'] = args.arff_regression
        writer_args['relation'] = args.arff_relation
    elif writer_type is LibSVMWriter:
        writer_args['label_map'] = label_map
    writer = writer_type(args.outfile, feature_set, **writer_args)
    writer.write()
def _parse_alerts(self, action_result, alerts):
    """Parse alerts to create containers and artifacts."""
    alerts_cnt = 0
    # What happens if you do not have alerts returned?
    # data = [] --> returns alerts_cnt = 0
    if alerts.get('data') is None:
        action_result.set_status(phantom.APP_ERROR, ARBORSIGHTLINE_ALERTS_DATA_KEY_UNAVAILABLE_MSG)
        return action_result.get_status(), None
    try:
        for data in alerts['data']:
            alert_id = data['id']
            target_address = data['attributes']['subobject']['host_address']
            impact_bps = data['attributes']['subobject']['impact_bps']
            impact_pps = data['attributes']['subobject']['impact_pps']
            victim_router = data['attributes']['subobject']['impact_boundary']
            classification = data['attributes']['classification']
            description = ""
            for include in alerts['included']:
                if (include['relationships']['parent']['data']['type'] == 'alert'
                        and include['relationships']['parent']['data']['id'] == alert_id):
                    description = include['attributes']['text']
                    break
            # Creating container
            c = {
                'data': {},
                'description': 'Ingested from Arbor Sightline',
                'source_data_identifier': alert_id,
                'name': '{0} {1}'.format(classification, target_address)
            }
            # self.send_progress('Saving container for alert id {0}...'.format(alert_id))
            status, msg, id_ = self.save_container(c)
            # self.save_progress("Container id : {}, {}, {}".format(id_, status, msg))
            if status == phantom.APP_ERROR:
                action_result.set_status(phantom.APP_ERROR,
                                         ARBORSIGHTLINE_CREATE_CONTAINER_FAILED_MSG.format(msg))
                return action_result.get_status(), None
            # Creating artifacts
            cef = {
                'targetAddress': target_address,
                'impactBps': impact_bps,
                'impactPps': impact_pps,
                'victimRouter': victim_router,
                'classification': classification,
                'description': description
            }
            art = {
                'container_id': id_,
                'name': 'Event Artifact',
                'label': 'event',
                'source_data_identifier': c['source_data_identifier'],
                'cef': cef,
                'run_automation': True
            }
            # self.send_progress('Saving artifact...')
            status, msg, id_ = self.save_artifact(art)
            if status == phantom.APP_ERROR:
                action_result.set_status(phantom.APP_ERROR,
                                         ARBORSIGHTLINE_CREATE_ARTIFACT_FAILED_MSG.format(msg))
                return action_result.get_status(), None
            alerts_cnt += 1
    except Exception as e:
        try:
            if e.message:
                error_msg = UnicodeDammit(e.message).unicode_markup.encode('UTF-8')
            else:
                error_msg = "Error message unavailable"
        except:
            error_msg = "Unable to parse error message"
        action_result.set_status(
            phantom.APP_ERROR,
            '{}. Error message: {}'.format(ARBORSIGHTLINE_PARSE_ALERTS_FAILED_MSG, error_msg))
        return action_result.get_status(), None
    return phantom.APP_SUCCESS, alerts_cnt
        words[w] = [line]

header = header + '\t' + '\t'.join([
    'F1_man', 'F2_man', 'F3_man', 'plt_code', 'plt_stress', 'plt_word', 't_man'
]) + '\n'
fw = open(outputFile, 'w')
fw.write(header)

plt_lines = open(pltFile, 'rb').readlines()
skipped_lines = []
# skip the first two lines since they contain header information
for plt_line in plt_lines[2:]:
    print(plt_line)
    plt_line = UnicodeDammit(plt_line, ['utf-8', 'windows-1252']).unicode_markup
    plt_line = unidecode(plt_line)
    plt_line = plt_line.rstrip()
    plt_F1 = plt_line.split(',')[0]
    # a line beginning with '1' is the first line of the vowel means; this
    # signals the end of the vowel token measurements, so we can stop
    # processing the file
    if plt_F1 == '1':
        break
    plt_w_raw = plt_line.split(',')[5].split(' ')[0]
    plt_w = plt_w_raw.upper()
    plt_w = plt_w.replace('(', '')
    plt_w = plt_w.replace(')', '')
    print(plt_w)
    if plt_w not in words:
is_intro = True
movie_script = []
intro = []
last_line_type = -1
last_character = ''
text = []
characters = []

for block in script_text.descendants:
    if isinstance(block, Tag):
        continue
    # UnicodeDammit converts any string to UTF-8 (does not work so well)
    block = UnicodeDammit(block, soup.original_encoding).unicode_markup
    # remove leading and ending end of lines
    block = block.strip('\n')
    # if the block doesn't have any text, skip it
    if re.search('\w', block) is None:
        continue
    # bs4 does not always split the blocks cleanly, so it is better to
    # re-split by paragraph and process the lines one by one
    for line in block.split('\n'):
        stripped_line = line.strip(' \n\t\r')
        if re.search('\w', line) is None:
            continue
        line_type = get_line_type(line, stripped_line, usual_spaces)
print(soup.p.encode('latin-1'))
print(soup.p.encode("utf-8"))

# Characters the target encoding cannot represent are converted to
# numeric character references
markup = u"<b>\N{SNOWMAN}</b>"
snowman_soup = BeautifulSoup(markup)
tag = snowman_soup.b
print(tag.encode('utf-8'))
print(tag.encode('latin-1'))
print(tag.encode('ascii'))

### Automatic encoding detection
from bs4 import UnicodeDammit

dammit = UnicodeDammit("Sacr\xc3\xa9 bleu!")
print(dammit.unicode_markup)
print(dammit.original_encoding)

dammit = UnicodeDammit("Sacr\xe9 bleu!", ["latin-1", "iso-8859-1"])
print(dammit.unicode_markup)
print(dammit.original_encoding)

# Smart quotes: when converting to Unicode, Windows smart quotes can be
# turned into HTML character entities
markup = b"<p>I just \x93love\x94 Microsoft Word\x92s smart quotes</p>"
print(UnicodeDammit(markup, ['windows-1252'], smart_quotes_to='html').unicode_markup)
print(UnicodeDammit(markup, ['windows-1252'], smart_quotes_to='ascii').unicode_markup)