Example #1
File: markov.py Project: jdpls/Servrhe
    def learn(self, name, phrase, channel):
        name = self.aliases.resolve(name)
        if name not in self.users:
            self.users[name] = True

        if "password" in phrase:
            return
        phrase = phrase.split(" ")
        phrase = filter(lambda x: x and "http" not in x and "ftp:" not in x and x[0] != ".", phrase)
        now = datetime.datetime.utcnow()
        documents = []

        for i in range(len(phrase) + 1):
            seed = UnicodeDammit.detwingle(phrase[i-1] if i > 0 else "")
            answer = UnicodeDammit.detwingle(phrase[i] if i < len(phrase) else "")

            documents.append({
                "name": name,
                "seed": seed,
                "answer": answer,
                "added": now,
                "random": random.random()
            })

        yield self.db.insert(documents, safe=True)
Example #2
File: markov.py Project: jdpls/Servrhe
    def ramble(self, name=None, seed=""):
        if name:
            name = self.aliases.resolve(name)
            if name not in self.users:
                returnValue("")

        message = []

        if seed:
            seed = UnicodeDammit.detwingle(seed)
            chunk = seed
            while chunk and len(" ".join(message)) < 300:
                message.append(chunk)
                chunk = yield self.prev(name, chunk)
            message.reverse()

        chunk = yield self.next(name, seed)
        while chunk and len(" ".join(message)) < 300:
            message.append(chunk)
            chunk = yield self.next(name, chunk)
            if not chunk and len(" ".join(message)) < 30:
                chunk = yield self.next(name, chunk)

        response = (" ".join(message)).decode("utf8")
        if seed and response == seed.decode("utf8"):
            response = yield self.ramble(name)
        returnValue(response)
Example #3
 def __init__(self, url):  # logs info, warning, error, critical, debug events.
     '''
     Description: This is the class constructor and is going to get a simple url as input and parse it based on RFC1738.
     Status: In Progress.
     Usage: This is going to be used by the connection manager and the active/passive scanner to extract url variables.
     '''
     self.url = UnicodeDammit.detwingle(url, 'UTF-8')        
     self.defaultHttpsPort = 443
     self.defaultHttpPort = 80
     urlLogger.logInfo("--- Package: UrlManager - Module: UrlHandler Class: urlHandler Initiated ---")
Example #4
   def selectdir(geturl):
      r = scraper.get(geturl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      rt = UnicodeDammit.detwingle(r.text)
      html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
      if debug == 1:
         orenc = str(html.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      findlinks = html.findAll('a')
      dirlist = []
      for link in findlinks:
         b = link.get('href')
         if not re.match(r'^((\.\.)?\/)$', str(b)):
            if re.search(r'^(.*)(\/)$', str(b)):
               dirlist.append(b)

      p = urlparse(geturl)
      part = p.path.split('/')[-1]
      path = p.path.rstrip(part)
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path

      i = 0
      dirtotal = len(dirlist)
      if dirtotal > 0:
         print('\nFOUND %d DIRECTORIES: \n' % dirtotal)
         while i < dirtotal:
            sel = i + 1
            print(str(sel) + ' - ' + str(dirlist[i]))
            i += 1
         print('')
         lim = dirtotal + 1
         matchtop = r'^(%s)(\/)?$' % urlfqdn
         if not re.match(matchtop,geturl):
            print('0 - BACK TO PARENT DIRECTORY \n')
            startsel = '0-%d' % dirtotal
         else:
            startsel = '1-%d' % dirtotal
         selectdir = raw_input('make a selection [%s] --> ' % startsel)
         if not int(selectdir) in range(0, lim):
            selectdir = raw_input('invalid entry. please enter a selection %s --> ' % startsel)
         if selectdir == '0':
            geturl = parent
            subcont = 0
         else:
            n = int(selectdir) - 1
            usedir = dirlist[n]
            geturl = parent + usedir
            subcont = 1
      else:
         print('\nNO DIRECTORIES FOUND. using current directory.. \n')
         subcont = 0
         geturl = parent + part
      return geturl, subcont, parent
Example #5
    def formatForReddit(self, feedEntry, postType, subreddit, raw):
        if 'content' in feedEntry:
          content = feedEntry['content'][0]['value']
        elif 'description' in feedEntry:
          content = feedEntry.description
        else:
          content = ''
        logging.debug(content)
        parser = EveRssHtmlParser()
        
        title = feedEntry['title']

        # some feeds like Twitter are raw so the parser hates it.
        if (raw):
          regex_of_url = '(https?:\/\/[\dA-z\.-]+\.[A-z\.]{2,6}[\/\w&=#\.\-\?]*)'
          title = re.sub(regex_of_url, '', title)
          clean_content = content.replace(' pic.twitter.com', ' http://pic.twitter.com')
          clean_content = re.sub(regex_of_url, '<a href="\\1">link</a>', clean_content)
          clean_content = UnicodeDammit.detwingle(clean_content)
          #logging.info(clean_content)
          u = UnicodeDammit(clean_content, 
                      smart_quotes_to='html', 
                      is_html = False )
          # fix twitter putting ellipses on the end
          content = u.unicode_markup.replace(unichr(8230),' ...')
          logging.debug('.....')
        
        if "tumblr.com" in content:
          # Replace with larger images (hopefully such images exist)
          content = content.replace('_500.', '_1280.')
        
        # Added the .replace because the parser does something funny to them and 
        # removes them before I can handle them
        content = content.replace('&nbsp;', ' ')
        content = content.replace('&bull;', '*').replace('&middot;','*')
        content = content.replace('&ldquo;','\'').replace('&rdquo;','\'')
        content = re.sub('( [ ]+)', ' ', content)
        parser.feed(content)
        parser.comments[0] = '%s\n\n%s' %(feedEntry['link'], parser.comments[0])
        parser.comments[-1] += self.config['signature']
        
        if 'author' in feedEntry:
          author = '~' + feedEntry['author'].replace('@', ' at ')
        else:
          author = ''

        return {'comments': parser.comments,
                'link':     feedEntry['link'],
                'subreddit': subreddit,
                'title':    '[%s] %s %s' %(postType, title, author)}
Example #6
File: irc.py Project: skiddiks/Servrhe
def normalize(s):
    if isinstance(s, unicode):
        return s

    try:
        u = s.decode("utf8")
    except:
        try:
            u = (s[:-1]).decode("utf8")
        except:
            try:
                u = UnicodeDammit.detwingle(s).decode("utf8")
            except:
                u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup

    return u
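A quick, hedged usage sketch for normalize() (Python 2, matching the snippet; the byte strings are made up): clean UTF-8 decodes directly, a single stray trailing byte is dropped by the second branch, and embedded windows-1252 bytes fall through to the detwingle / UnicodeDammit guesses.

print(normalize("Sacr\xc3\xa9 bleu!"))        # valid UTF-8: the first decode succeeds
print(normalize("Sacr\xc3\xa9 bleu!\x93"))    # broken trailing byte: the s[:-1] branch drops it
print(normalize("\x93quoted\x94 text"))       # windows-1252 quotes: detwingle rewrites them as UTF-8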
Example #7
def make_id3str(kwargs):
	'''
	take existing id3 info and turn it into something ffmpeg understands
	'''
	args = ut.dotdict(kwargs)
	id3str = ''
	for index, tag in enumerate(args.id3fields): #loop thru the raw list of id3 values, grip the index
		if tag is not None:
			if args.id3rawlist[index] is not None:
				id3str = id3str + " -metadata " + tag + '"' + args.id3rawlist[index].replace('"','') + '"'
	if not "album=" in args.id3fields:
		id3str = id3str + ' -metadata album="' + args.assetName + '" -metadata publisher="UCSB Special Research Collections"'
	else:
		id3str = id3str + ' -metadata publisher="UCSB Special Research Collections"'
	id3str = UnicodeDammit.detwingle(id3str)
	return id3str
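An illustrative call with made-up tag values; it assumes ut.dotdict simply exposes dict keys as attributes and that the tag names already carry the trailing '=' the concatenation above expects (Python 2, since detwingle is handed a plain str):

kwargs = {
    "id3fields": ["title=", "artist=", None],
    "id3rawlist": ["Oral history tape 12", "KCSB interview", None],
    "assetName": "cusb-srctape12",
}
print(make_id3str(kwargs))
# roughly: -metadata title="Oral history tape 12" -metadata artist="KCSB interview"
#          -metadata album="cusb-srctape12" -metadata publisher="UCSB Special Research Collections"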
Example #8
File: irc.py Project: rcombs/Servrhe
def normalize(s):
    if isinstance(s, unicode):
        return s

    try:
        u = s.decode("utf8")
    except:
        try:
            u = (s[:-1]).decode("utf8")
        except:
            try:
                u = UnicodeDammit.detwingle(s).decode("utf8")
            except:
                u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup

    return u
Example #9
 def getpage(cfurl):      
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    if 'text' in r.headers.get('Content-Type'):
       rt = UnicodeDammit.detwingle(r.text)
       html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
       print('\r\n--------------------------------------------------------\r\n')
       if debug == 1:
          orenc = str(html.original_encoding)
          print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
       bs = html.prettify(formatter=None)
       print(bs)
       print('\r\n--------------------------------------------------------\r\n')
    else:
       found = -1
    
    if debug == 1:
       print('\n\033[34mDEBUG: finished list length: \033[37;1m%d \033[0m\n' % len(finished))
Example #10
    def run(self, params={}):
        try:
            eml_file = base64.b64decode(params.get(
                Input.EML_FILE)).decode('utf-8')
        except Exception as ex:
            self.logger.debug(ex)
            self.logger.debug(
                "Failed to parse message as UTF-8, attempting to detwingle first before retrying parse"
            )
            eml_file = UnicodeDammit.detwingle(
                base64.b64decode(params.get(Input.EML_FILE))).decode(
                    'utf-8', errors='ignore')

        msg = email.message_from_string(eml_file)

        result = format_output.format_result(self.logger, msg)
        return {Output.RESULT: result}
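The try/except pattern above (base64-decode, try UTF-8, detwingle on failure) also works as a small standalone helper; this is just a sketch, not part of the plugin's actual API:

import base64

from bs4 import UnicodeDammit


def b64_to_text(b64_blob):
    """Decode a base64 payload to text, detwingling mixed encodings on failure."""
    raw = base64.b64decode(b64_blob)
    try:
        return raw.decode("utf-8")
    except UnicodeDecodeError:
        # Rewrite embedded windows-1252 bytes as UTF-8, then decode leniently.
        return UnicodeDammit.detwingle(raw).decode("utf-8", errors="ignore")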
Example #11
File: crunchy.py Project: rcombs/Servrhe
    def format(self, script):
        dammit = UnicodeDammit.detwingle(script)
        soup = BeautifulSoup(dammit, from_encoding="utf8")
        header = soup.find('subtitle_script')
        header = "[Script Info]\nTitle: "+header['title']+"\nScriptType: v4.00+\nWrapStyle: "+header['wrap_style']+"\nPlayResX: 624\nPlayResY: 366\nScaledBorderAndShadow: yes\nYCbCr Matrix: TV.709\n\n";
        styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n";
        events = "\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n";
        stylelist = soup.findAll('style')
        eventlist = soup.findAll('event')
        
        for style in stylelist:
            styles += "Style: " + style['name'] + "," + style['font_name'] + "," + style['font_size'] + "," + style['primary_colour'] + "," + style['secondary_colour'] + "," + style['outline_colour'] + "," + style['back_colour'] + "," + style['bold'] + "," + style['italic'] + "," + style['underline'] + "," + style['strikeout'] + "," + style['scale_x'] + "," + style['scale_y'] + "," + style['spacing'] + "," + style['angle'] + "," + style['border_style'] + "," + style['outline'] + "," + style['shadow'] + "," + style['alignment'] + "," + style['margin_l'] + "," + style['margin_r'] + "," + style['margin_v'] + "," + style['encoding'] + "\n"

        for event in eventlist:
            events += "Dialogue: 0,"+event['start']+","+event['end']+","+event['style']+","+event['name']+","+event['margin_l']+","+event['margin_r']+","+event['margin_v']+","+event['effect']+","+event['text']+"\n"

        formattedSubs = header+styles+events
        return formattedSubs
Example #12
    def format(self, script):
        dammit = UnicodeDammit.detwingle(script)
        soup = BeautifulSoup(dammit, from_encoding="utf8")
        header = soup.find('subtitle_script')
        header = "[Script Info]\nTitle: "+header['title']+"\nScriptType: v4.00+\nWrapStyle: "+header['wrap_style']+"\nPlayResX: 624\nPlayResY: 366\nScaledBorderAndShadow: yes\nYCbCr Matrix: TV.709\n\n";
        styles = "[V4+ Styles]\nFormat: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding\n";
        events = "\n[Events]\nFormat: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text\n";
        stylelist = soup.findAll('style')
        eventlist = soup.findAll('event')
        
        for style in stylelist:
            styles += "Style: " + style['name'] + "," + style['font_name'] + "," + style['font_size'] + "," + style['primary_colour'] + "," + style['secondary_colour'] + "," + style['outline_colour'] + "," + style['back_colour'] + "," + style['bold'] + "," + style['italic'] + "," + style['underline'] + "," + style['strikeout'] + "," + style['scale_x'] + "," + style['scale_y'] + "," + style['spacing'] + "," + style['angle'] + "," + style['border_style'] + "," + style['outline'] + "," + style['shadow'] + "," + style['alignment'] + "," + style['margin_l'] + "," + style['margin_r'] + "," + style['margin_v'] + "," + style['encoding'] + "\n"

        for event in eventlist:
            events += "Dialogue: 0,"+event['start']+","+event['end']+","+event['style']+","+event['name']+","+event['margin_l']+","+event['margin_r']+","+event['margin_v']+","+event['effect']+","+event['text']+"\n"

        formattedSubs = header+styles+events
        return formattedSubs
Example #13
    def run(self, params={}):
        data = params.get(Input.DATA)
        try:
            decoded = base64.b64decode(data).decode('utf-8')
        except Exception as ex:
            self.logger.debug("Error decoding")
            self.logger.debug(ex)
            decoded = UnicodeDammit.detwingle(data).decode('utf-8',
                                                           errors='ignore')
        pattern = params.get(Input.PATTERN)
        behavior = params.get(Input.BEHAVIOR)

        output = utils.process_grep(
            utils.run_grep(self.logger, decoded, pattern, behavior))

        return {
            Output.FOUND: output.get('found'),
            Output.HITS: output.get('hits'),
            Output.MATCHES: output.get('matches')
        }
Example #14
def to_unicode(data, is_html=False, detwingle=False, verbose=True, lang=None):
    " converts everything to unicode"
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        new_data = UnicodeDammit.detwingle(data)
        dammit = UnicodeDammit(new_data, is_html=is_html)

    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n" %
                         (dammit.original_encoding))

    if lang is None:
        return dammit.unicode_markup

    if lang == 'auto':
        lang = _guess_lang_from_data(dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % (lang))

    return _to_unicode_chared(data, lang, verbose=verbose)
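A minimal usage sketch for the to_unicode() helper above (the byte string is illustrative; with lang=None the function simply returns UnicodeDammit's best guess):

# windows-1252 "smart quotes" pushed through the helper above.
cp1252_bytes = u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}".encode("windows-1252")
print(to_unicode(cp1252_bytes, detwingle=True, verbose=False))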
Example #15
def to_unicode(data, is_html=False, detwingle=False, verbose=True,
               lang=None):
    " converts everything to unicode"
    dammit = UnicodeDammit(data, is_html=is_html)
    if detwingle and dammit.original_encoding == 'windows-1252':
        new_data = UnicodeDammit.detwingle(data)
        dammit = UnicodeDammit(new_data, is_html=is_html)

    if verbose:
        sys.stderr.write("Original encoding (via BS): %s\n" %
                         (dammit.original_encoding))

    if lang is None:
        return dammit.unicode_markup

    if lang == 'auto':
        lang = _guess_lang_from_data(dammit.unicode_markup, is_html=is_html)
        if verbose:
            sys.stderr.write("Detected language: %s\n" % (lang))

    return _to_unicode_chared(data, lang, verbose=verbose)
Example #16
def unicode_dammit_example():
    # Install the 'chardet' or 'cchardet' Python libraries for better guesses

    ### Take a string with unknown encoding and make the string Unicode
    weirdass_string = "Sacr\xc3\xa9 bleu!"
    dammit = UnicodeDammit(weirdass_string)
    print "Original Word with weird encoding:", weirdass_string
    print "Dammit Print:", (dammit.unicode_markup)
    print "Dammit Type:", (dammit.original_encoding)

    ### Take a doc with mostly UTF-8 encoding (and misc encodings due to mult
    # data sources) and convert to UTF-8 Unicode with .Dammit.detwingle()
    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}")
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")
    # So now we have one doc with two encodings in it, printing is a mess
    #print "Weird Decoding doc with utf8:", doc # messed up, won't print
    #print (doc.decode("windows-1252")) # So messed up it doesn't even print

    # Decode using UnicodeDammit.detwingle() converts the string to pure UTF-8
    new_doc = UnicodeDammit.detwingle(doc)
    print new_doc.decode("utf8")
Example #17
    def run(self, params={}):
        result = {}
        try:
            eml_file = base64.b64decode(params.get('eml_file')).decode('utf-8')
        except Exception as ex:
            self.logger.debug(ex)
            self.logger.debug(
                "Failed to parse message as UTF-8, attempting to detwingle first before retrying parse"
            )
            eml_file = UnicodeDammit.detwingle(
                base64.b64decode(params.get('eml_file'))).decode(
                    'utf-8', errors='ignore')
        msg = email.message_from_string(eml_file)

        result['date'] = msg['Date']
        result['from'] = msg['From']
        result['to'] = msg['To'] or msg['Delivered-To'] or ''
        if result['to'] is None:
            result['to'] = ''
        if result['to'] == '':
            self.logger.debug("No To address.")
        result['subject'] = msg['Subject']
        bdy = utils.body(msg, self.logger)
        result['body'] = bdy
        atchs = utils.attachments(msg, self.logger)
        result['attachments'] = []
        for a in atchs:
            result['attachments'].append(a)

        parser = email.parser.HeaderParser()
        headers = parser.parsestr(msg.as_string())
        header_list = []
        for h in headers.items():
            header_list.append({'key': h[0], 'value': h[1]})
        result['headers'] = header_list
        self.logger.info("*" * 10)
        self.logger.info({'result': result})
        return {'result': result}
Example #18
def ensure_unicode(text: str,
                   most_likely_encodings: Union[str, Iterable[str]] = (),
                   ) -> str:
    if isinstance(most_likely_encodings, str):
        most_likely_encodings = [most_likely_encodings]
    elif isinstance(most_likely_encodings, Iterable):
        most_likely_encodings = list(most_likely_encodings)
    else:
        raise TypeError(most_likely_encodings)

    # decode bytes
    if isinstance(text, (bytes, bytearray)):
        text = UnicodeDammit.detwingle(text)

    # unexpected type, just coerce
    elif not isinstance(text, str):
        text = str(text)

    # convert to unicode
    text = UnicodeDammit(text, most_likely_encodings).unicode_markup

    # ftfy for good measure
    return ftfy.fix_text(text)
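A short usage example for ensure_unicode() (requires beautifulsoup4 and ftfy; the byte string deliberately mixes UTF-8 and windows-1252):

# Mixed-encoding input: UTF-8 "Sacré" plus windows-1252 curly quotes.
raw = "Sacr\u00e9 bleu! ".encode("utf-8") + "\u201cdetwingled\u201d".encode("windows-1252")
print(ensure_unicode(raw, most_likely_encodings="utf-8"))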
Example #19
 def getlinks(cfurl):
    r = scraper.get(cfurl, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
    rt = UnicodeDammit.detwingle(r.text)
    html = BeautifulSoup(rt.decode('utf-8'), "html.parser")
    if debug == 1:
       orenc = str(html.original_encoding)
       print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
    bs = html.prettify(formatter=None)
    linkresult = html.findAll('a')
    if len(linkresult) > 0:
       foundlinks = len(linkresult)
       print('\nFOUND %s LINKS AT %s:\n' % (str(foundlinks), cfurl))
       for link in linkresult:
          b = link.get('href')
          b = str(b)
          if b not in cfurl and not re.match(r'^(\.\.)?\/$', b):
             print(b)
       print('')
    else:
       print('\nNO LINKS FOUND.\n')
       foundlinks = 0
    time.sleep(4)
    return foundlinks
Example #20
    def to_unicode(data, is_html=False, detwingle=False, verbose=False,
                   lang=None):
        """ Produce unicode from text of unknown encoding.
        Input: bytestring """
        dammit = UnicodeDammit(data, is_html=is_html)
        if detwingle and dammit.original_encoding == 'windows-1252':
            new_data = UnicodeDammit.detwingle(data)
            dammit = UnicodeDammit(new_data, is_html=is_html)

        if verbose:
            sys.stderr.write("Original encoding (via BS): %s\n" %
                             (dammit.original_encoding))

        if lang is None:
            return dammit.unicode_markup

        if lang == 'auto':
            lang = TextSanitizer.guess_lang_from_data(
                dammit.unicode_markup, is_html=is_html)
            if verbose:
                sys.stderr.write("Detected language: %s\n" % (lang))

        return TextSanitizer._to_unicode_chared(data, lang, verbose=verbose)
Example #21
    def run(self, params={}):
        data = params.get(Input.DATA)
        try:
            decoded = base64.b64decode(data).decode('utf-8')
        except Exception as ex:
            self.logger.debug("Error decoding")
            self.logger.debug(ex)
            decoded = UnicodeDammit.detwingle(data).decode('utf-8', errors='ignore')

        output = utils.process_grep(
            utils.cat_lines(
                self.logger,
                decoded,
                params.get(Input.PATTERN),
                params.get(Input.BEHAVIOR)
            )
        )

        return {
            Output.FOUND: output.get(utils.FOUND),
            Output.HITS: output.get(utils.HITS),
            Output.MATCHES: output.get(utils.MATCHES)
        }
Example #22
    def to_unicode(data, is_html=False, detwingle=False, verbose=False,
                   lang=None):
        """ Produce unicode from text of unknown encoding.
        Input: bytestring """
        dammit = UnicodeDammit(data, is_html=is_html)
        if detwingle and dammit.original_encoding == 'windows-1252':
            new_data = UnicodeDammit.detwingle(data)
            dammit = UnicodeDammit(new_data, is_html=is_html)

        if verbose:
            sys.stderr.write("Original encoding (via BS): %s\n" %
                             (dammit.original_encoding))

        if lang is None:
            return dammit.unicode_markup

        if lang == 'auto':
            lang = TextSanitizer.guess_lang_from_data(
                dammit.unicode_markup, is_html=is_html)
            if verbose:
                sys.stderr.write("Detected language: %s\n" % (lang))

        return TextSanitizer._to_unicode_chared(data, lang, verbose=verbose)
Example #23
def unicode_dammit_example():
    # Install the 'chardet' or 'cchardet' Python libraries for better guesses

    ### Take a string with unknown encoding and make the string Unicode
    weirdass_string = "Sacr\xc3\xa9 bleu!"
    dammit = UnicodeDammit(weirdass_string)
    print "Original Word with weird encoding:", weirdass_string
    print "Dammit Print:", (dammit.unicode_markup)
    print "Dammit Type:", (dammit.original_encoding)

    ### Take a doc with mostly UTF-8 encoding (and misc encodings due to mult
    # data sources) and convert to UTF-8 Unicode with .Dammit.detwingle()
    snowmen = (u"\N{SNOWMAN}" * 3)
    quote = (
        u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}"
    )
    doc = snowmen.encode("utf8") + quote.encode("windows-1252")
    # So now we have one doc with two encodings in it, printing is a mess
    #print "Weird Decoding doc with utf8:", doc # messed up, won't print
    #print (doc.decode("windows-1252")) # So messed up it doesn't even print

    # Decode using UnicodeDammit.detwingle() converts the string to pure UTF-8
    new_doc = UnicodeDammit.detwingle(doc)
    print new_doc.decode("utf8")
Example #24
def getAndCleanHTML(filepath):

    page = open(filepath, "rb")

    page = page.read()

    clearPage = UnicodeDammit.detwingle(page)

    doc = bs(clearPage, 'lxml')

    for script in doc(["script", "style"]):
        script.extract()

    docText = doc.get_text(' ')

    # this is the line that removes the digits <------------
    docText = docText.translate({ord(ch): None for ch in '0123456789'})

    docText = docText.lower()

    docText = docText.replace("e-book", "ebook")

    docText = docText.replace("blu-ray", "bluray")

    for symbol in symbolsToReplace:
        docText = docText.replace(symbol, ' ')

    docText = docText.replace("  ", ' ')

    lines = (line.strip() for line in docText.splitlines())

    chunks = (phrase.strip() for line in lines for phrase in line.split(' '))

    text = ' '.join(chunk for chunk in chunks if chunk)

    return text
Example #25
## MAIN FILE

my_path = '/Users/lekha/galvanize/capstone/prelims/huskies/data/2015-05-26-Washington/'
all_files = [f for f in os.listdir(my_path) if os.path.isfile(os.path.join(my_path, f))]
data = {}

#files = ['00006.html', '05111108.html', '120394.html', '1bettyevans.html']
#files = ['05111108.html']


files = all_files[1000:]

for html_file in files:
    with open(os.path.join(my_path, html_file)) as f:
        s = str(f.readlines())
        new_s = UnicodeDammit.detwingle(s)
        new_s = new_s.decode("utf-8")
        soup = BeautifulSoup(new_s, 'html.parser')
        summary = extractSummary(soup)
        names = extractName(soup)

        opath = '/Users/lekha/galvanize/capstone/prelims/huskies/data/2015-05-26-Washington/'
        ofile = os.path.join(opath, "output0.txt")
        #printSummaryRows(summary, opath)
#        printPhotoRows(photos, opath)
#        printSkillRows(skills, opath)

        # soup = BeautifulSoup(s, 'html.parser')
        # full_name = soup.find('span', {'class': 'full-name'})
        # summary = soup.find('div', {'class':'summary'})
        # if full_name:
Example #26
def soup_in(filename):
    return BeautifulSoup(
        UnicodeDammit.detwingle(open(filename).read()).decode('utf8'))
Example #27
 def process_txt(self, fileobj):
     return UnicodeDammit.detwingle(fileobj.read())
Example #28
File: s.py Project: lite/MyTestBox
 def slim_html(self, raw_html):
     doc = UnicodeDammit.detwingle(raw_html)
     soup = BeautifulSoup(doc, "html5lib", from_encoding="utf-8")
     return soup.prettify().encode("utf-8");
Example #29
 def process_txt(self, fileobj):
     return UnicodeDammit.detwingle(fileobj.read())
Example #30
	def decode_html(self, html_string):
		new_doc = UnicodeDammit.detwingle(html_string)
		return new_doc.decode("utf-8")
Example #31
print(UnicodeDammit(markup, ['windows-1252']).unicode_markup)

# Inconsistent encodings mixed in one document
snowmen = (u"\N{SNOWMAN}" * 3)
quote = (
    u"\N{LEFT DOUBLE QUOTATION MARK}I like snowmen!\N{RIGHT DOUBLE QUOTATION MARK}"
)
doc = snowmen.encode("utf8") + quote.encode("windows_1252")

print(doc)
print(doc.decode("windows-1252"))

# UnicodeDammit.detwingle() converts this byte string to pure UTF-8,
# so that both the snowmen and the quotation marks display correctly.
# Call UnicodeDammit.detwingle() on a document before creating a BeautifulSoup or UnicodeDammit object, to make sure its encoding is consistent.
new_doc = UnicodeDammit.detwingle(doc)
print(new_doc.decode("utf-8"))

## Parsing only part of a document: SoupStrainer
from bs4 import SoupStrainer

only_a_tags = SoupStrainer('a')
print(only_a_tags)

only_tags_with_id_link2 = SoupStrainer(id='link2')
print(only_tags_with_id_link2)


def is_short_string(string):
    return len(string) < 10
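As a follow-up to the SoupStrainer objects above, a hedged sketch of how they are typically handed to BeautifulSoup via parse_only (html_doc is a stand-in string here, and BeautifulSoup is assumed to be imported as elsewhere in this file):

html_doc = '<p>introduction</p><a href="http://example.com/elsie" id="link1">Elsie</a>'
only_a_soup = BeautifulSoup(html_doc, "html.parser", parse_only=only_a_tags)
print(only_a_soup.prettify())

# is_short_string() above can drive a strainer that keeps only strings shorter than 10 characters.
short_strings = SoupStrainer(string=is_short_string)
print(BeautifulSoup(html_doc, "html.parser", parse_only=short_strings))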
Example #32
# Connecting to MongoDB
try:
    conn=pymongo.MongoClient()
    print "\nConectado com sucesso ao MongoDB!"
except pymongo.errors.ConnectionFailure, e:
   print "\nNão foi possível conectar ao MongoDB: %s" % e 

db = conn.mydb

for cont in range(1, 3):
	url = "http://omelete.uol.com.br/busca/?q=the+walking+dead"
	opener = urllib2.build_opener()
	opener.addheaders = [('User-agent', 'Mozilla/5.0')]
	documento = opener.open(url)
	documento = UnicodeDammit.detwingle(documento.read())
	soup = BeautifulSoup(documento)

	paginas = soup.body.find("div", {"class": "pagination"})
	paginas = paginas.findAll('a')
	paginas = paginas[len(paginas)-2]
	paginas = paginas.text
	paginas = int(paginas)

	i = 0
	for i in range(1, paginas+1):
		if(i==1):
			url = "http://omelete.uol.com.br/busca/?q=the+walking+dead"
		else:
			url = "http://omelete.uol.com.br/busca/?page="+ str(i) +"&q=the%20walking%20dead"
		opener = urllib2.build_opener()
Example #33
   def followlinks(bx):
      p = urlparse(bx)
      if '/' not in p.path[-1:]:
         part = p.path.split('/')[-1]
         path = p.path.rstrip(part)
      else:
         path = p.path
      if '/' not in path[:1]:
         path = '/' + path
      urlfqdn = p.scheme + '://' + p.netloc
      parent = urlfqdn + path + '/'
      s = scraper.get(bx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
      print('\n----------------------------------------------------------- \n')
      print(s)
      print('\n')
      scr = UnicodeDammit.detwingle(s.text)
      shtml = BeautifulSoup(scr, "html.parser")
      if debug == 1:
         orenc = str(shtml.original_encoding)
         print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
      print('\n----------------------------------------------------------- \n')
      sfindlinks = shtml.findAll('a')
      slen = len(sfindlinks)
      sdirs = []
      si = 0
      while si < slen:
         for slink in sfindlinks:
            if debug == 1:
               print('\n\033[34;1mSLINK LOOP\r\n\033[32;21m* si = %d, si < %d\033[0m\n' % (si, slen))
            sl = slink.get('href')
            si += 1
            if sl:
               if not re.search(r'^((\.\.)?\/)$', str(sl)):
                  if '/' in bx[-1:]:
                     if 'http' not in sl[:4]:
                        sl = sl.lstrip('/')
                        sx = bx + sl
                     else:
                        sx = sl
                     print(sx)
                     getCF(sx, 0)
                     ss = scraper.get(sx, stream=True, verify=False, proxies=proxystring, allow_redirects=True)
                     bs = BeautifulSoup(ss.text, "html.parser")
                     if bs is not None:                        
                        if debug == 1:
                           orenc = str(bs.original_encoding)
                           print('\n\033[40m\033[35;1mORIGINAL ENCODING: %s \033[0m\n' % orenc)
                        pagehead = bs.html.head.contents
                        pagehead = str(pagehead)
                        if pagehead:
                           pagetitle = re.search(r'<title>(.*)<\/title>', pagehead)
                           pagetitle = str(pagetitle.group(1))
                           bigtitle = pagetitle.upper()
                           titlestars = lambda a: '*' * (len(str(a)) + 4)
                           pagestars = titlestars(pagetitle)
                           print('\n\033[40m\033[33m%s\n\033[34;1m* %s * \n\033[40m\033[33;21m%s\n\033[0m' % (pagestars, bigtitle, pagestars)) 
                     sb = bs.find_all('a', href = re.compile(r'.+$'))
                     #sb = bs.findAll('a')
                     sblen = len(sb)
                     if sblen > 0:
                        n = 0
                        while n < sblen:
                           for sbl in sb:
                              if debug == 1:
                                 print('\n\033[35;1mSBL LOOP\r\n\033[37;21m* n = %d, n < %d \033[0m\n' % (n, sblen))
                              if sbl is not None:
                                 sr = sbl.get('href').strip()
                                 sr = str(sr)
                                 print('\n* %s \n') % sr
                                 if not re.search('http', sr[:4]):
                                    parent = getparent(sx)
                                    srs = sr.lstrip('/')
                                    sr = parent + srs
                                 if re.match(r'([^.]+\/)$', str(sr)):
                                    followlinks(sr)
                                    sdirs.append(sr)
                                 else:
                                    if '/' not in sr[-1:]:
                                       getCF(sr, 0)
                                       sdirs.append(sr)
                                 n += 1
                              else:
                                 n += 1
                                 continue

                  elif 'Error-222' in bx:
                     print('\nuh-oh. might have triggered a flag with cloudflare.\n')
                     for i in xrange(10,0,-1):
                        time.sleep(1)        
                        print('delaying request for %d seconds.. \r' % i)
                        sys.stdout.flush()
                     break
                  else:
                     if not re.search('http', str(sl[:4])):
                        parent = getparent(bx)
                        sl = sl.lstrip('/')
                        sx = parent + sl
                     else:
                        sx = str(sl)

                  sx = str(sx)
                  sdirs.append(sx)
                  print(sx)
                  print('\n----------------------------------------------------------- \n')              
                  getCF(sx, 0)
               si += 1

               #if re.search(r'^(.*)(\/)$', str(bx)):
            else:
               print('\nno links found at %s \n' % str(slink))
               si += 1
               continue

      for sd in sdirs:
         if '/' in sd[-1:]:
            print('\nfollowing directory: %s \n' % sd)
            followlinks(sd)
            getCF(sd, 1)
         else:
            print('\nrequesting link: %s \n' % sd)
            getCF(sd, 0)
      return sdirs
Example #34
def get_data(site_code):

    url = config.get(
        'DEFAULTS',
        'weather_data_url_prefix') + '/' + site_code.upper() + config.get(
            'DEFAULTS', 'weather_data_url_file_extension')

    logger.debug('retrieval url: %s' % (url))

    # Make soup
    try:
        resp = urlopen(url)

        LastRetrieval = datetime.strptime(resp.headers['Date'],
                                          '%a, %d %b %Y %H:%M:%S %Z')
        LastModified = datetime.strptime(resp.headers['Last-Modified'],
                                         '%a, %d %b %Y %H:%M:%S %Z')

        logger.debug('web page timestamp: Last-Modified: ' +
                     resp.headers['Last-Modified'])

        contents = resp.read()
        new_contents = UnicodeDammit.detwingle(contents)
        soup = BeautifulSoup(new_contents, "html.parser")

    except URLError as e:
        logger.warn('An error occurred fetching data\n\t%s\n\t%s' %
                    (url, e.reason))
        return {}

    # Get table
    try:
        tables = soup.findAll("table")
        table = tables[3]
    except AttributeError as e:
        logger.warn('No tables found at %s, exiting: %s' % (url, e.reason))
        return 1
    except LookupError as e:
        logger.warn('there is no index table[3] on the page for ' + url)
        return 1
    except IndexError as e:
        logger.warn('there is no index table[3] on the page for ' + url)
        return 1

    # Get rows
    try:
        rows = table.find_all('tr')
    except AttributeError as e:
        logger.warn('No table rows found at %s, exiting: %s' % (url, e.reason))
        return 1

    # first two columns are created from the table
    table_columns = out_file_columns[3:len(out_file_columns)]

    # Get data
    table_data = parse_rows(rows)

    # prepare the data read from the web page
    today = datetime.now()
    month = today.month
    year = today.year
    monthedge = 0

    data_rows = {}
    for i in table_data:

        data = dict(zip(table_columns, i))

        day = data['Date']

        # this gets over month/year edges.
        if int(day) <= 2 and monthedge == 0:
            monthedge = 1

        hour, minute = data['Time'].split(':')

        my_month = -1

        # this gets over month/year edges.
        if int(day) > 2 and monthedge == 1:
            my_month = month - 1  # the month is coming from 'localtime' not the webpage
            if my_month == 0:  # january fix
                my_month = 12
                year = year - 1
        else:
            my_month = month

        obs_datetime = datetime(year, my_month, int(day), int(hour),
                                int(minute))

        data['site_code'] = site_code.upper()
        data['DateTime'] = obs_datetime.strftime('%Y-%m-%d %H:%M:00')
        data['TIMESTAMP'] = 'TS:' + data['DateTime']

        # these fields are stored in the database as numbers, but the web pages use 'NA' for missing data.  that string needs to be replaced with None
        check_field_values = ['AirTemp', 'Dewpoint', 'AirPressureAltimeter']
        for field in check_field_values:
            if data[field] == 'NA':
                data[field] = None
            elif not data[field]:
                data[field] = None

        data_rows[data['TIMESTAMP']] = data

    return [LastRetrieval, LastModified, data_rows]
Example #35
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<b><!-- i am a comment --></b>

<p class="story">...</p>
"""

# Encoding detection; a list of likely encodings can be supplied
dammit = UnicodeDammit(html_doc, ["utf8", "gbk"])
if 0:
    print dammit.original_encoding

# Convert embedded Windows-1252 bytes in the string to UTF-8
html_doc = UnicodeDammit.detwingle(html_doc)

# Diagnose the document parsing process
# diagnose(html_doc)

# Parse only part of the document, for efficiency
only_a_tag = SoupStrainer("a")

# html_doc can be a file object or a string
soup = BeautifulSoup(html_doc, features=["lxml"], from_encoding='utf8')
'''
:param: features=[] list of parsers to use
:param: from_encoding='utf8' the document encoding
:param: parse_only a SoupStrainer instance
'''
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<b><!-- i am a comment --></b>

<p class="story">...</p>
"""

# Encoding detection; a list of likely encodings can be supplied
dammit = UnicodeDammit(html_doc, ["utf8", "gbk"])
if 0:
    print dammit.original_encoding

# Convert embedded Windows-1252 bytes in the string to UTF-8
html_doc = UnicodeDammit.detwingle(html_doc)

# Diagnose the document parsing process
# diagnose(html_doc)


# Parse only part of the document, for efficiency
only_a_tag = SoupStrainer("a")

# html_doc can be a file object or a string
soup = BeautifulSoup(html_doc, features=["lxml"], from_encoding='utf8')
'''
:param: features=[] list of parsers to use
:param: from_encoding='utf8' the document encoding
:param: parse_only a SoupStrainer instance
'''
Example #37
File: irc.py Project: lae/Servrhe
def normalize(s):
    try:
        u = UnicodeDammit.detwingle(s).decode("utf8")
    except:
        u = UnicodeDammit(s, ["utf8", "windows-1252"]).unicode_markup
    return u
Example #38
def extract(paginas, directorio, prefijoArchivo, nombreArchivo):
    from bs4 import BeautifulSoup, Tag, UnicodeDammit
    import re, os

    noEsPuntoFinal = re.compile('[^.?:\']$')
    textoImagen = re.compile(r'(Table)|(Figure)\D+(\xa0)*\d+.')
    esSeccion = re.compile(r'[0-9].\s\'?[A-Z]')
    esSubseccion = re.compile(r'\([a-z]\)\s+[A-Z]')
    esCapitulo = re.compile(r'PART\s[A-Z]+')
    mydir = os.path.abspath(os.path.dirname(__file__))
    parrafo = ''
    esParrafo = False
    sigueImagen = False

    with open(directorio + prefijoArchivo + nombreArchivo + ".html",
              'w') as cap:
        cap.write('<?xml version="1.0" encoding="UTF-8"?>\n')
        cap.write('<html xmlns="http://www.w3.org/1999/xhtml">\n')
        cap.write('<head>\n')
        cap.write(
            '<meta http-equiv="content-type" content="application/xhtml+xml; charset=UTF-8" />\n'
        )
        cap.write(
            '<title>Decision and Control: The meaning of Operational Research and Management Cybernetics</title>\n'
        )
        cap.write(
            '<link rel="stylesheet" href="SBDecisionandControl.css" type="text/css" />\n'
        )
        cap.write('</head>\n')
        cap.write('<body style="margin:5px; padding: 5px;">\n')

        # To handle images, the parent tag is changed to <font>, unless it is <i>
        for i in paginas:
            with open(
                    os.path.join(mydir, '..', '1995StaffordBeer', 'OEBPS',
                                 'oldData', 'page_' + str(i) + ".xml")) as sf1:
                soup = BeautifulSoup(sf1, 'html.parser')
                for img in soup.find_all('img'):
                    src = img['src']
                    img['src'] = 'images/' + src
                    parent = img.parent
                    width = img['width']
                    if (parent.name == 'td'):
                        parent.name = 'font'
                        parent['face'] = 'imagen'
                    if (parent.parent.name == 'font'):
                        parent.parent['size'] = '6'
                    if (float(width) > 100):
                        parent['size'] = '6'
                    else:
                        parent['size'] = '3'
                    # print(img.parent.parent)

                # cadena = '<td><font face="Times New Roman, Times, Serif" size="3">We are concerned with times, which are numbered consecutively 0, 1, 2, 3, 4 and so on. Typically, the time is t, and the time before it <i>t-1;</i> so the gap between them prescribes a basic interval, typically the <i>t</i>th interval. There is a range of activities which could occur in this interval, and a range of items which could be manufactured too. These could all be nominated by a string of consecutive numbers as well; but typically there is a jth activity and an ith item. Now if an increment a is defined as an addition to the cumulative flow function through the works, it must relate to a certain time interval, and it may pair off any item with any activity. The whole range of such possibilities is written </font><font face="Symbol" size="3"><i>a</i></font><i><font face="Times New Roman, Times, Serif" size="1"><sub>ij</sub></font><sub><font face="Times New Roman, Times, Serif" size="2">(t)</font></sub></i><sub><font face="Times New Roman, Times, Serif" size="2"></font></sub><sub><font face="Times New Roman, Times, Serif" size="2"></font><font face="Times New Roman, Times, Serif" size="3"> Similarly, a decrement <img src="6b58e80766105c011135240bb26d95b7.gif" border="0" alt="C0159-01.gif" width="10" height="12" /> is defined as subtracting from the cumulative flow; its particulars are specified in the same way. The first is an input coefficient, occurring at the end of the interval. The unknown, x (or series <i>of xs),</i> which must be calculated is in this case the number of units of each item that must be produced by each activity. The following expression accounts precisely for the equilibrial condition (that is, that the input to and the output from the system must match) of the dynamic system described above:</font></sub></td>'
                # soup = BeautifulSoup(cadena, 'html.parser')

                for data in soup.find_all('font'):
                    if (data['size'] == "2" and data.get_text()[0:4]
                            == "Page"):  # Para eliminar el dato de páginas
                        # cap.write(str(data) + "\n")
                        continue
                    elif (data['size'] == "0"
                          ):  # drop the gaps between paragraphs
                        continue
                    elif (data['size'] == "2" and textoImagen.match(
                            UnicodeDammit.detwingle(data.get_text()))
                          # (textoImagen.match(data.get_text()))
                          ):  # for tables
                        data.name = "p"
                        data['class'] = "center"
                        del data['size']
                        del data['face']
                        cap.write(str(data) + "\n")
                        sigueImagen = False
                        continue
                    elif (data['size'] == "2" and data.contents
                          == []):  # for empty superscripts
                        continue
                    elif (data['size'] == "2"
                          and isinstance(data.contents[0], Tag)
                          and data.contents[0].name
                          == 'sup'):  # for superscripts
                        linea = ''
                        for child in data.children:
                            linea = linea + str(child)
                        parrafo = parrafo + linea
                    elif (data['size'] == "1" and data.contents
                          == []):  # for empty subscripts
                        continue
                    elif (data['size'] == "1"
                          and isinstance(data.contents[0], Tag)
                          and data.contents[0].name
                          == 'sub'):  # for subscripts
                        linea = ''
                        for child in data.children:
                            linea = linea + str(child)
                        parrafo = parrafo + linea
                    elif (data['size'] == "2" and data.parent.name
                          == 'sub'):  # for standalone superscripts
                        data.name = "sub"
                        del data['size']
                        del data['face']
                        linea = str(data)
                        # for child in data.children:
                        #    linea = linea + str(child)
                        parrafo = parrafo + linea
                    elif (data['size'] == "2"):  # para las tablas
                        data.name = "p"
                        data['class'] = "bl_extract"
                        del data['size']
                        del data['face']
                        cap.write(str(data) + '<br/>' + "\n")
                        # cap.write(data.get_text() + "\n")
                    elif (data["size"] == "6"
                          ):  # for the preprocessed images
                        data.name = "p"
                        data['class'] = "center"
                        del data['size']
                        del data['face']
                        cap.write(str(data) + "\n")
                    elif (data["size"] == "3" and esSeccion.match(
                            data.get_text())):  # for section headings
                        data.name = "h2"
                        data['class'] = "section"
                        del data['size']
                        del data['face']
                        cap.write(str(data) + "\n")
                    elif (data["size"] == "3" and esSubseccion.search(
                            data.get_text())):  # for subsections
                        data.name = "h3"
                        data['class'] = "section"
                        del data['size']
                        del data['face']
                        cap.write(str(data) + "\n")
                    elif (data["size"] == "3" and data['face'] == 'Symbol'):
                        if (esParrafo):
                            # linea = data.get_text()
                            linea = str(data)
                            parrafo = parrafo + linea
                            print('Symbol: ' + data.get_text())
                        else:
                            cap.write('Symbol: ' + str(data) + "\n")
                            print('Symbol: ' + data.get_text())
                            continue
                    elif (data["size"] == "3" and data.contents == []):
                        continue
                    elif (data["size"] == "3"
                          and esParrafo):  # to join consecutive paragraphs
                        data.name = "p"
                        data['class'] = "indent"
                        del data['size']
                        del data['face']
                        if (noEsPuntoFinal.search(data.get_text().rstrip())):
                            linea = ''
                            for child in data.children:
                                linea = linea + str(child)
                            parrafo = parrafo + linea + " "
                            # cap.write("Entre Parrafo: " + str(parrafo) + "\n")
                        else:
                            esParrafo = False
                            linea = ""
                            for child in data.children:
                                linea = linea + str(child)
                            parrafo = parrafo + linea
                            cap.write('<p class="indent">' + str(parrafo) +
                                      "</p>\n")
                            parrafo = ''
                    elif (data["size"] == "3"):  # Para los parrafos del libro
                        data.name = "p"
                        data['class'] = "indent"
                        del data['size']
                        del data['face']
                        if noEsPuntoFinal.search(data.get_text().rstrip()
                                                 ):  # check whether it ends in a full stop
                            esParrafo = True
                            linea = ""
                            for child in data.children:
                                linea = linea + str(child)
                                # cap.write("Children: " + str(child) + "\n")
                            parrafo = linea + " "
                            # cap.write("Inicio Parrafo: "+str(parrafo) + "\n")
                            continue
                        cap.write(str(data) + "\n")
                    elif (data['size'] == "4"
                          ):  # for titles and headings
                        data.name = "h1"
                        if (esCapitulo.match(data.get_text())):
                            data['class'] = 'chapter1'
                        else:
                            data['class'] = 'section'
                        del data['size']
                        del data['face']
                        cap.write(str(data) + "\n")
                    else:  # fallback in case something unexpected slips through
                        data.name = "p"
                        cap.write(str(data) + "\n")

        cap.write('</body>\n')
        cap.write('</html>\n')
Example #39
 def decode_html(self, html_string):
     new_doc = UnicodeDammit.detwingle(html_string)
     return new_doc.decode("utf-8")
Example #40
                ## Grab the article XML file ##
                ###############################
                xmlFile = os.path.join(root, name)
                print("Processing " + xmlFile)
                log.add_msg("Processing " + xmlFile)
                log.countArticles += 1

                ###############################
                ## Grab the article metadata ##
                ###############################

                # Read the XML
                f = open(xmlFile)
                #xmlStr = UnicodeDammit(f.read())
                #tree = BeautifulSoup(xmlStr.unicode_markup,"lxml")
                rawtext = UnicodeDammit.detwingle(f.read())
                tree = BeautifulSoup(rawtext.decode('utf-8', 'ignore'), "xml")
                #tree = BeautifulSoup(f.read(),"lxml")
                f.close()
                #print tree.prettify()

                #############################################
                ## Process NLM or JATS-formatted XML files ##
                #############################################
                if tree.find('front'):  # NLM or JATS formatted XML
                    fmt = "NLM"
                    # Read the first three elements and create the article object
                    try:
                        doi = tree.front.find('article-id', {
                            'pub-id-type': 'doi'
                        }).text