Example #1
def gen_slogan(msg):
    html = get('http://www.sloganizer.net/en/outbound.php', params={'slogan': msg})
    slogan = re.search('>(.*)<', html.text).group(1)
    parser = HTMLParser()
    slogan = parser.unescape(parser.unescape(slogan))
    slogan = slogan.replace('\\', '').strip()
    return slogan if slogan else gen_slogan(msg)
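A minimal sketch, not from the original project, of why unescape is applied twice in gen_slogan above: if the service returns doubly-escaped entities such as &amp;quot;, one pass only turns &amp; back into &, and a second pass is needed to recover the actual character.
from html.parser import HTMLParser

parser = HTMLParser()
doubly_escaped = 'Say &amp;quot;hello&amp;quot;'         # hypothetical doubly-escaped input
print(parser.unescape(doubly_escaped))                    # Say &quot;hello&quot;
print(parser.unescape(parser.unescape(doubly_escaped)))   # Say "hello"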
Example #2
def parse_videos_from_feed():
    """
    Ingest the MRSS feed into local scope; format videos to the FB upload spec
    """
    data = feedparser.parse(os.getenv('MTFV_MRSS_URL'))
    h = HTMLParser()
    videos = []
    for video in data.entries:
        if get_value(video['guid']):
            continue
        formatted_video = {
            'title': h.unescape(video['title']),
            'description': h.unescape(video['summary']),
            'guid': video['guid'],
            'file_url': video['media_content'][0]['url'],
            'file_size': video['media_content'][0]['filesize'],
            'thumb_url': video['media_thumbnail'][0]['url']
        }
        if os.getenv('MRSS_SCHEDULED_DATETIME_ELEMENT') and video[os.getenv('MRSS_SCHEDULED_DATETIME_ELEMENT')]:
            formatted_video['published'] = 0
            formatted_video['unpublished_content_type'] = 'SCHEDULED'
            datetime = parse(video[os.getenv('MRSS_SCHEDULED_DATETIME_ELEMENT')])
            formatted_video['scheduled_publish_time'] = datetime.strftime('%s')
        videos.append(formatted_video)
    return videos
Example #3
 def parseHTML(self,string,timezoneoffset):
     h = HTMLParser()
     anime_showtimes = {}
     start = False
     for line in string:
         try:
             if not line.startswith('<h2 class="weekday">') and not start:
                 pass
             elif re.search(r'<td class="schedule-page-show">.*>(.*)</a>', line) != None:
                 title = h.unescape(re.search(r'<td class="schedule-page-show">.*>(.*)</a>', line).group(1))
             elif re.search(r'<td class="schedule-show">(.*)</td>', line) != None:
                 title = h.unescape(re.search(r'<td class="schedule-show">(.*)</td>', line).group(1))
             elif re.search(r'<h2 class="weekday">(.*)</h2>',line) != None:
                 weekday = h.unescape(re.search(r'<h2 class="weekday">(.*)</h2>',line).group(1))
                 if weekday == 'To be scheduled':
                     break
                 elif not start:
                     start = True
                 for i,a in enumerate(['Sunday','Monday','Tuesday','Wednesday','Thursday','Friday','Saturday']):
                     if weekday == a:
                         weekday = i
                         break
             elif re.search(r'<td class="schedule-time">(\d\d:\d\d)</td>',line) != None:
                 time = h.unescape(re.search(r'<td class="schedule-time">(\d\d:\d\d)</td>',line).group(1))
                 time_object = datetime.datetime(2016,5,8+weekday,int(time[0:2]),int(time[3:5])) + datetime.timedelta(hours=int(timezoneoffset))
                 anime_showtimes[title.lower()] = [time_object.isoweekday(),time_object.hour,time_object.minute,title]
         except Exception as e:
             print(str(e))
     return anime_showtimes
Example #4
    def handle_task(self, job):
        user = job.get("user", "root")
        group = job.get("group", "root")
        mail = job.get("sender", None)

        account = Account(user=user, group=group, mail=mail)

        recipients = job.get("recipients", None)
        subject = ensure_unicode(job.get("subject", ""))
        body = ensure_unicode(job.get("body", ""))
        attachments = job.get("attachments", None)
        smtp_host = job.get("smtp_host", "localhost")
        smtp_port = job.get("smtp_port", 25)
        html = job.get("html", False)

        template_data = job.get("jobctx", {})
        body = Template(body)(template_data)
        subject = Template(subject)(template_data)

        if not html:
            h = HTMLParser()
            body = h.unescape(body)
            subject = h.unescape(subject)

        # Execute the task
        return self.sendmail(account, recipients, subject, body, attachments, smtp_host, smtp_port, html)
Example #5
    def handle_task(self, job):
        user = job.get('user', 'root')
        group = job.get('group', 'root')
        mail = job.get('sender', None)

        account = Account(user=user, group=group, mail=mail)

        recipients = job.get('recipients', None)
        subject = ensure_unicode(job.get('subject', ''))
        body = ensure_unicode(job.get('body', ''))
        attachments = job.get('attachments', None)
        smtp_host = job.get('smtp_host', 'localhost')
        smtp_port = job.get('smtp_port', 25)
        html = job.get('html', False)

        template_data = job.get('jobctx', {})
        body = Template(body)(template_data)
        subject = Template(subject)(template_data)

        if not html:
            h = HTMLParser()
            body = h.unescape(body)
            subject = h.unescape(subject)

        # Execute the task
        return self.sendmail(
            account, recipients, subject, body, attachments, smtp_host,
            smtp_port, html)
Example #6
 def _get_track_name(self, t_data):
     html_parser = HTMLParser()
     full_name = "{0} - {1}".format(
         html_parser.unescape(t_data['artist'])[:50].strip(),
         html_parser.unescape(t_data['title'])[:50].strip(),
     )
     full_name = re.sub('[' + FORBIDDEN_CHARS + ']', "", full_name)
     full_name = re.sub(' +', ' ', full_name)
     return full_name
Example #7
def get_quiz():
    h = HTMLParser()
    random.seed()
    quiz_type = random.choice([0, 1])
    quiz_up = from_group(1) + from_group(2) + from_group(3)
    random.shuffle(quiz_up)
    parent_body = lambda c: h.unescape(c.parent.body_html) if c.parent != None else c.submission.title
    p_b = lambda c: parent_body(c) if quiz_type == 1 else ''
    t_d = lambda c: str(c.created_utc - c.submission.created_utc) if quiz_type == 1 else ''
    quiz = {'quiz': [{'body': h.unescape(c.body_html), 'comment_id': c.c_id, 'parent_body': p_b(c), 'time_diff': t_d(c)} for c in quiz_up], 'type': quiz_type}
    return json.dumps(quiz)
Example #8
def get_track_full_name(data):
    """
        Gets track full name and convert it to string like 'artist_track.mp3' 
    """
    forbidden_symbols = ',!.;/'
    html_parser = HTMLParser()
    full_name = u"{0}_{1}".format(
        html_parser.unescape(data['artist'][:100]).strip(),
        html_parser.unescape(data['title'][:100]).strip(),
    )
    full_name = ''.join(c for c in full_name if not c in forbidden_symbols)
    return full_name + ".mp3"
Example #9
def editf():
    fn=request.args.get('filename')
    if os.path.exists('mdfile/'+fn)==False:
        editer={"filename":fn,"content":"# New_File.md"}
        h=HTMLParser()
        return h.unescape(render_template("tmpl/model/edit.html",editer = editer))
    else:
        f=open('mdfile/'+fn)
        tx=f.read()
        f.close()
        editer={"filename":fn,"content":tx}
        h=HTMLParser()
        return h.unescape(render_template("tmpl/model/edit.html",editer = editer))
Example #10
def gowiki():
    nm=request.args.get('file')
    if os.path.exists('mdfile/'+nm)==False:
        wiki={'title':"Page Not Found",'content':"# Page Not Found"}
        h=HTMLParser()
        return h.unescape(render_template("tmpl/model/model.html",wiki = wiki))
    else:
        f=open('mdfile/'+nm)
        ct=f.read()
        f.close()
        wiki={'title':nm,'content':ct}
        h=HTMLParser()
        return h.unescape(render_template("tmpl/model/model.html",wiki = wiki))
Example #11
    def _parse_article(self, div):
        self.article = Article()

        parser = HTMLParser()

        for tag in div:
            if not hasattr(tag, 'name'):
                continue
            if tag.name == 'div' and self._tag_has_class(tag, 'gs_ri'):
                rt = tag.find('h3', {'class': 'gs_rt'})
                if rt:
                    ctu = rt.find('span')
                    if ctu:
                      ctu.extract()
                    self.article['title'] = parser.unescape(''.join(rt.findAll(text=True)).strip())
                    if rt.a:
                      self.article['url'] = self._path2url(rt.a['href'])

                if tag.find('div', {'class': 'gs_a'}):
                    year = self.year_re.findall(tag.find('div', {'class': 'gs_a'}).text)
                    self.article['year'] = year[0] if len(year) > 0 else None

                if tag.find('div', {'class': 'gs_fl'}):
                    self._parse_links(tag.find('div', {'class': 'gs_fl'}))

                if tag.find('div', {'class': 'gs_rs'}):
                    self.article['summary'] = tag.find('div', {'class': 'gs_rs'}).text

        if self.article['title']:
            self.handle_article(self.article)
Example #12
def get_game_list (system):
	"""List all the games on Guardiana for a given system."""
	
	response = urllib.request.urlopen ("http://www.guardiana.net/MDG-Database/Complete-List/" + system + "/")
	
	doc = response.read ()
	
	soup = BeautifulSoup(doc)
	html_game_list = soup.find("div", {"id": "MDGD_FullList_Box"})
	
	game_list = re.findall ("""» <a href="(.+?)">(.+?)</a><br/>(?:<em>)?(.*?)(?:</em>)?<br/>""", str (html_game_list))
	
	game_dict_list = []
	
	for game in game_list:
		game_dict = {'url': "http://www.guardiana.net" + game[0], 'title': [ ]}
		
		# Clean up the URL and add it
		result = re.search ("(.*?)\?PHPSESSID=.*?", game[0])
		if result:
			game_dict['url'] = "http://www.guardiana.net" + result.group(1)
		else:
			game_dict['url'] = "http://www.guardiana.net" + game[0]
		
		# Unescape the HTML entities from titles and add them
		pars = HTMLParser()
		game_dict['title'].append (pars.unescape (game[1]))
		game_dict_list.append (game_dict)
	
	return game_dict_list
Example #13
    def linksh(self, cli, ev):
        try:
            self.chancache[ev.target.lower()]
        except:
            return 1
        if self.yt is True:
            yr = re.compile(".*(youtube\.com\/watch\?.*v=|youtu\.be\/)([A-Za-z"
                                                    "0-9._%-]*)[&\w;=\+_\-]*.*")
            res = yr.search(ev.arguments[0])
            if res is not None:
                self.ytlinks(cli, ev, res)
                return 0
        url = re.compile("((https?):((\/\/)|(\\\\))+[\w\d:#@%\/;$()~_?\+-=\\\."
                                                                        "&]*)")
        res = url.search(ev.arguments[0])
        if res is None:
            return 1
        uri = res.group(1)
        r = urllib.request.urlopen(uri).read().decode('utf-8', 'replace')
        parser = HTMLParser()
        r = parser.unescape(r)
        yr = re.compile(".*<title[^>]*>([^<]+)</title>.*")
        title = yr.search(r)
        if title is None:
            return 1

        cli.msg(ev.target, title.group(1))
Example #14
def _get_springer_journal_stats(journal_id, period, oa=False):
    if not journal_id.isdigit():
        raise ValueError("Invalid journal id " + journal_id + " (not a number)")
    url = SPRINGER_FULL_SEARCH.format(journal_id, period, period)
    if oa:
        url = SPRINGER_OA_SEARCH.format(journal_id, period, period)
    print(url)
    try:
        req = Request(url, None)
        response = urlopen(req)
        content = response.read()
        content = content.decode("utf-8")
        results = {}
    except HTTPError as httpe:
        if httpe.code == 503: # retry on timeout
            print(colorise("Timeout (HTTP 503), retrying...", "yellow"))
            return _get_springer_journal_stats(journal_id, period, oa)
        else:
            raise httpe
    count_match = SEARCH_RESULTS_COUNT_RE.search(content)
    if count_match:
        count = count_match.groupdict()['count']
        count = count.replace(",", "")
        results['count'] = int(count)
    else:
        raise ValueError("Regex could not detect a results count at " + url)
    title_match = SEARCH_RESULTS_TITLE_RE.search(content)
    if title_match:
        title = (title_match.groupdict()['title'])
        htmlparser = HTMLParser()
        results['title'] = htmlparser.unescape(title)
    else:
        raise ValueError("Regex could not detect a journal title at " + url)
    return results
Example #15
    def publishPost(self, post, link, comment):
        logging.info("    Publishing in Telegram...")
        bot = self.tc
        title = post
        content = comment
        links = ""
        channel = self.channel

        from html.parser import HTMLParser
        h = HTMLParser()
        title = h.unescape(title)
        text = '<a href="'+link+'">'+title+ "</a>\n" + content + '\n\n' + links
        textToPublish2 = ""
        if len(text) < 4090:
            textToPublish = text
            links = ""
        else:
            text = '<a href="'+link+'">'+title + "</a>\n" + content
            textToPublish = text[:4080] + ' ...'
            textToPublish2 = '... '+ text[4081:]

        logging.info("text to "+ textToPublish)
        logging.info("text to 2"+ textToPublish2)

        bot.sendMessage('@'+channel, textToPublish, parse_mode='HTML') 
        if textToPublish2:
            try:
                bot.sendMessage('@'+channel, textToPublish2[:4090], parse_mode='HTML') 
            except:
                bot.sendMessage('@'+channel, "Text is longer", parse_mode='HTML') 
        if links:
            bot.sendMessage('@'+channel, links, parse_mode='HTML') 
Example #16
def forwards(apps, schema_editor):
    html_parser = HTMLParser()

    for cascade_element in CascadeElement.objects.all():
        if cascade_element.plugin_type != 'CarouselSlidePlugin':
            continue

        caption = cascade_element.glossary.get('caption')
        if not caption:
            continue

        text_element = add_plugin(cascade_element.placeholder, TextPlugin, cascade_element.language,
                                  target=cascade_element)

        old_body = html_parser.unescape(caption)
        new_body, count = _replace_text_body(
            old_body,
            input_pattern=r'<img ([^>]*)\bid="plugin_obj_(?P<pk>\d+)"([^>]*)/?>',
            output_tag='<cms-plugin {}></cms-plugin>',
            id_format='id="{}"',
        )
        text_element.body = new_body
        text_element.save()

        # TODO: need to be re-tested
        if False and count > 0:
            for link_element in CMSPlugin.objects.filter(parent_id__in=(cascade_element.id, cascade_element.parent_id), plugin_type='TextLinkPlugin'):
                # print("Move Link {} from {} -> {}".format(link_element.id, link_element.parent_id, text_element.id))
                link_element.move(text_element, pos='last-child')
                link_element.save()
Example #17
def get_images(current_title, title, titles_length):
    h = HTMLParser()
    print("Fetching images from %s... (%s/%s)" % (title, current_title + 1, titles_length))
    # Escape the title so we can create a valid link
    # title = title.replace('\'', '%27').replace(' ', '%20')
    # Repetition is success
    while True:
        try:
            page = urlopen(SOURCE_LOCATION % title).read().decode(ENCODING)
            break
        except IOError:
            print("\tServer's being lazy, retrying...")

    if not page:
        print("\tFailed to get %s's images!" % title)
        return []
    # Ignore redirects
    if search("#DOORVERWIJZING", page, I | M) is not None or search("#REDIRECT.*", page, I | M) is not None:
        print("\tSkipping redirecting page %s" % title)
        return []
    imagelinks = []
    parser = ImageLocater(imagelinks)

    page = h.unescape(page)

    try:
        parser.feed(page)
    except:
        print("%s is a malformatted page" % title)
        return []

    return imagelinks
Example #18
 def extract(self):
     ms = re.findall(r"<title>(.*)\s-\sYouTube</title>", self.data)
     p = HTMLParser()
     if ms:
         return p.unescape(ms[0])
     else:
         return ""
Example #19
File: cli.py  Project: 37206/nigra
def group_search(keywords, cookie):
    '''
    parser for groups by keyword, lol
    '''
    from html.parser import HTMLParser
    parser = HTMLParser()
    s = ''
    for word in keywords:
        s += word + ' '
    site = 'http://vk.com/al_groups.php'  # group search
#    site='http://vk.com/al_video.php'  # and that is where the hash for videos lives
    post = {'act':'server_search', 'al':'1', 'q':s}  # the magic POST payload
#    post={'act':'show','al':'1','module':'vieo','video':'100457938_162516488'}
    data = req.post(site,post)  
    html = parser.unescape(data.text)
#    print(html)
#    sys.exit
    html_pre = html.strip().splitlines()
    groups = []
    line = 'd'
    group_stat = collections.namedtuple('group_stat', ['path', 'name', 'num'])
    nstr = 0
    for line in html_pre:
        line = line.lstrip()
        if line.lstrip().startswith('<div class="group_row_labeled"><a href='):
            # a bit more magic
            temp1 = re.search(r'(?<=<div class="group_row_labeled"><a href=")/\w+', line).group()
            temp2 = re.sub(r'<.+?>', '', line)
            nstr = 1
        elif nstr == 1:
            nstr = 2
        elif nstr == 2:
            groups.append(group_stat(temp1, temp2, re.search(r'\d+', line).group()))
            nstr = 0
    return groups
Example #20
def __parseResultsArea1(resultsArea):
    """
    Parses <div id="resultsArea">...</div> from Bing! history page
    Returns a list of queries (can be empty list)
    """
    startMarker = '<span class="query_t">'
    startMarkerLen = len(startMarker)

    history = []
    htmlParser = HTMLParser()

    s = 0
    while True:
        s = resultsArea.find(startMarker, s)
        if s == -1: break

# locate a query
        s += startMarkerLen
        s = resultsArea.index("<a ", s)
        s += 3
        s = resultsArea.index(">", s)
        s += 1
        e = resultsArea.index("</a>", s)

# resultsArea[s:e] now contains a query from history
        history.append(htmlParser.unescape(resultsArea[s:e]).strip())

        s = e + 4

    return history
Example #21
def __parseResultsArea2(resultsArea):
    """
    Parses results from Bing! history page
    Returns a list of queries (can be empty list)
    """
    startMarker = '<span class="sh_item_qu_query">'
    startMarkerLen = len(startMarker)

    history = []
    htmlParser = HTMLParser()

    s = 0
    while True:
        s = resultsArea.find(startMarker, s)
        if s == -1: break

# locate a query
        s += startMarkerLen
        e = resultsArea.index("</span>", s)

# resultsArea[s:e] now contains a query from history
        history.append(htmlParser.unescape(resultsArea[s:e]).strip())

        s = e + 7

    return history
Example #22
def check_gplay(app):
    time.sleep(15)
    url = 'https://play.google.com/store/apps/details?id=' + app.id
    headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux i686; rv:18.0) Gecko/20100101 Firefox/18.0'}
    req = urllib.request.Request(url, None, headers)
    try:
        resp = urllib.request.urlopen(req, None, 20)
        page = resp.read().decode('utf-8', errors='replace')  # decode so the str regexes below can match
    except urllib.error.HTTPError as e:
        return (None, str(e.code))
    except Exception as e:
        return (None, 'Failed:' + str(e))

    version = None

    m = re.search('itemprop="softwareVersion">[ ]*([^<]+)[ ]*</div>', page)
    if m:
        html_parser = HTMLParser()
        version = html_parser.unescape(m.group(1))

    if version == 'Varies with device':
        return (None, 'Device-variable version, cannot use this method')

    if not version:
        return (None, "Couldn't find version")
    return (version.strip(), None)
Example #23
def twitch_lookup(location):
    locsplit = location.split("/")
    if len(locsplit) > 1 and len(locsplit) == 3:
        channel = locsplit[0]
        type = locsplit[1]  # should be b or c
        id = locsplit[2]
    else:
        channel = locsplit[0]
        type = None
        id = None
    h = HTMLParser()
    fmt = "{}: {} playing {} ({})"  # Title: nickname playing Game (x views)
    if type and id:
        if type == "b":  # I haven't found an API to retrieve broadcast info
            soup = http.get_soup("http://twitch.tv/" + location)
            title = soup.find('span', {'class': 'real_title js-title'}).text
            playing = soup.find('a', {'class': 'game js-game'}).text
            views = soup.find('span', {'id': 'views-count'}).text + " view"
            views = views + "s" if not views[0:2] == "1 " else views
            return h.unescape(fmt.format(title, channel, playing, views))
        elif type == "c":
            data = http.get_json("https://api.twitch.tv/kraken/videos/" + type + id)
            title = data['title']
            playing = data['game']
            views = str(data['views']) + " view"
            views = views + "s" if not views[0:2] == "1 " else views
            return h.unescape(fmt.format(title, channel, playing, views))
    else:
        data = http.get_json("http://api.justin.tv/api/stream/list.json?channel=" + channel)
        if data and len(data) >= 1:
            data = data[0]
            title = data['title']
            playing = data['meta_game']
            viewers = "\x033\x02Online now!\x02\x0f " + str(data["channel_count"]) + " viewer"
            print(viewers)
            viewers = viewers + "s" if not " 1 view" in viewers else viewers
            print(viewers)
            return h.unescape(fmt.format(title, channel, playing, viewers))
        else:
            try:
                data = http.get_json("https://api.twitch.tv/kraken/channels/" + channel)
            except:
                return
            title = data['status']
            playing = data['game']
            viewers = "\x034\x02Offline\x02\x0f"
            return h.unescape(fmt.format(title, channel, playing, viewers))
Example #24
 def downloaded_to_intermediate(self, basefile, attachment=None):
     # Check to see if this might not be a proper SFS at all
     # (from time to time, other agencies publish their stuff
     # in SFS - this seems to be handled by giving those
     # documents a SFS nummer on the form "N1992:31". Filter
     # these out.
     if basefile.startswith('N'):
         raise IckeSFS("%s is not a regular SFS" % basefile)
     filename = self.store.downloaded_path(basefile)
     try:
         t = TextReader(filename, encoding=self.source_encoding)
     except IOError:
         self.log.warning("%s: Fulltext is missing" % basefile)
         # FIXME: This code needs to be rewritten
         baseuri = self.canonical_uri(basefile)
         if baseuri in registry:
             title = registry[baseuri].value(URIRef(baseuri),
                                             self.ns['dcterms'].title)
             desc.value(self.ns['dcterms'].title, title)
         desc.rel(self.ns['dcterms'].publisher,
                  self.lookup_resource("Regeringskansliet"))
         desc.value(self.ns['dcterms'].identifier, "SFS " + basefile)
         doc.body = Forfattning([Stycke(['Lagtext saknas'],
                                        id='S1')])
     # Check to see if the Författning has been revoked (using
     # plain fast string searching, no fancy HTML parsing and
     # traversing)
     if not self.config.keepexpired:
         try:
             t.cuepast('<i>Författningen är upphävd/skall upphävas: ')
             datestr = t.readto('</i></b>')
             if datetime.strptime(datestr, '%Y-%m-%d') < datetime.today():
                 self.log.debug('%s: Expired' % basefile)
                 raise UpphavdForfattning("%s is an expired SFS" % basefile,
                                          dummyfile=self.store.parsed_path(basefile))
             t.seek(0)
         except IOError:
             t.seek(0)
     t.cuepast('<pre>')
     # remove &auml; et al
     try:
         # this is the preferred way from py34 onwards. FIXME: Move
         # this to ferenda.compat
         import html
         txt = html.unescape(t.readto('</pre>'))
     except ImportError:
         # this is the old way.
         hp = HTMLParser()
         txt = hp.unescape(t.readto('</pre>'))
     if '\r\n' not in txt:
         txt = txt.replace('\n', '\r\n')
     re_tags = re.compile("</?\w{1,3}>")
     txt = re_tags.sub('', txt)
     # add ending CRLF aids with producing better diffs
     txt += "\r\n"
     util.writefile(self.store.intermediate_path(basefile), txt,
                    encoding=self.source_encoding)
     return codecs.open(self.store.intermediate_path(basefile),
                        encoding=self.source_encoding)
Example #25
def index():
    response = requests.get(
        'http://oraalu.fe.up.pt:8888/aulas/WEB_DATA.json_alus', auth=HTTPBasicAuth('ei12060', 'Vitor'))
    h = HTMLParser()

    document = json.loads(h.unescape(response.text))
    alus.insert(document)
    return "OK"
Example #26
    def get_entries(self):
        entries = []

        try:
            feed = self.get_feed()
            entries = feed["entries"]
        except Exception:
            # clear the cache so we try again
            key = RSS_JOBS_KEY % (self.org.id, self.id)
            cache.delete(key)
            pass

        html_parser = HTMLParser()
        for entry in entries:
            summary = entry.get("summary", "")
            entry["summary"] = strip_tags(html_parser.unescape(html_parser.unescape(summary)))
        return entries
Example #27
File: eqbot.py  Project: codl/eqbot
def fetchTitle(url):
    page = ur.urlopen(ur.Request(url, data=None,
        headers={'User-Agent': "Mozilla/5.0 Python-urllib/2.6 EqBot"}))
    enc = page.info().get("Content-Type").partition("=")[2]
    if enc == "": enc = "utf-8"
    try:
        return " ".join(HTMLParser.unescape(HTMLParser, re.search(b"<title>(.*?)</title>", page.read(10000).translate(None,b"\n\r\t"), flags = re.I | re.M).expand(b"\\1").decode("utf-8")).split())
    except (AttributeError, HTTPError, URLError):
        return None
Example #28
    def handle_link_posted(self, data, match):
        if settings.POST_URLS.get(data["target"]) is None:
            self.logger.debug(
                'No POST_URLS in the settings for this channel. Cannot post.')
            return None

        if settings.POST_URLS_SALTS.get(data['target']) is None:
            self.logger.debug(
                'No security token for this channel. Cannot post.')
            return None

        url = match.group(1)

        # text after the url is assumed to be a title
        suggested_title = match.group(3)

        submission_salt = settings.POST_URLS_SALTS[data['target']]

        payload = {
            "url": url,
            "person": data['nick'],
            "submission_salt": submission_salt,
        }

        if suggested_title:
            payload["title"] = suggested_title

        post_url = settings.POST_URLS[data["target"]]
        response = requests.post(post_url,
                                 data=json.dumps(payload),
                                 headers={'content-type': 'application/json'})

        if response.status_code != 200:
            self.send('Link Scraper Error: {}'.format(response.text), data)

        self.logger.debug('Posted {url} to {post_url}. Response was {text}. Response code was {code}'.format(
            code=response.status_code,
            url=url,
            text=response.text,
            post_url=post_url))

        if settings.POST_URL_TITLES and \
           settings.POST_URL_TITLES.get(data["target"]):
            head = requests.head(url)
            content_type = head.headers['content-type'].split(' ')[0].strip(';')
            if content_type == 'text/html':
                request = requests.get(url, headers={
                    'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Ubuntu Chromium/44.0.2403.89 Chrome/44.0.2403.89 Safari/537.36'
                    })
                soup = BeautifulSoup(request.text)
                if soup.title is not None:
                    parser = HTMLParser()
                    title = soup.title.string
                    title = parser.unescape(title)
                    title = title.strip()  # kill newlines and whitespace...
                    self.send('Link title: {}'.format(title), data)
Example #29
 def get_body(text):
     blog_page = html.fromstring(text)
     result = blog_page.cssselect('#blogContent')
     if result:
         blogcontent_div = blog_page.cssselect('#blogContent')[0]
         h = HTMLParser()
         body_text = h.unescape(etree.tostring(blogcontent_div).decode('utf-8'))
         return body_text
     else:
         return None
Example #30
    def post(self, message):
        h = HTMLParser()
        message = h.unescape(message)
        url = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', message)
        self.auth = tweepy.OAuthHandler(TWITTER['consumer_key'], TWITTER['consumer_secret'])
        self.auth.set_access_token(TWITTER['access_token'], TWITTER['access_token_secret'])

        self.api = tweepy.API(self.auth)

        logger.debug('Posting to Twitter')
        self.api.update_status(status=message)
Example #31
def render_html_diff(text1, text2, diff_id="diff_id"):
    """Render html diff between text1 and text2
       text1 and text2 will be normalized"""
    parser = HTMLParser()
    normalized_space_t1 = prepare_text_spaces(text1)
    normalized_space_t2 = prepare_text_spaces(text2)
    result = htmldiff.render_html_diff(normalized_space_t1,
                                       normalized_space_t2)
    soup, result = format_spaces(result)
    soup, result = normalize_diff(soup, diff_id)
    soup = wrap_diff(soup, diff_id)
    result = tag_to_text(soup.body)
    result = parser.unescape(result)
    return soup, result
Example #32
def __get_prjinfo_all__(clsWeb):

    # Get the outing item IDs, then delete them
    strUrl = "util/frametree/OpensingleXtreeAction.do?datatype=son&openid=attendance_project&rootname=项目工作&conds=projectname@like&param=508,260&keyname=projectid"
    clsWeb.__set_param__(strUrl)
    r = requests.get(clsWeb.Referer, cookies=clsWeb.cookie)

    # lxml operations
    html = etree.HTML(r.text)

    # Filter the data; the 14th td is where the data starts
    rr = html.xpath("/descendant::script[position()=14]")
    nLen = len(rr)
    if (nLen > 1):
        return (False, pDict)

    # Initial parse: break the tree structure apart
    strTemp = str(etree.tostring(rr[0]))
    strTemps = strTemp.split('new xyTree.NodeNormal')

    nLen = len(strTemps)
    if (nLen < 2):
        return (False, pDict)

    # Parser object for decoding the Chinese-encoded web page
    htmlparser = HTMLParser()

    # Loop over everything, extracting and parsing
    for i in range(1, nLen):
        strTemp2 = str(strTemps[i])
        strTemps2 = strTemp2.split(' = ')

        # Extract objects that carry values
        if (len(strTemps2) > 0):
            #for j in range(0, len(strTemps2)):
            #print(strTemps2[j])

            strName = clsWeb.Get_EleStr(strTemps2[0], "(\\'", "\\');")
            strKey = clsWeb.Get_EleStr(strTemps2[1], "\\'", "\\';")

            # Convert the Chinese text (HTML-unescape)
            strName = htmlparser.unescape(strName)
            pDict[strName] = strKey
            #print(strName)
            #print(pDict[strName])

    # Save the project dictionary
    path = "./mySetting/Prj_Dict.csv"
    __write_prjinfo_all__(pDict, path)
    return (True, pDict)
Example #33
def decryption(request):
    if request.method == 'POST':
        sender = request.POST['sender']
        try:
            driver = webdriver.Chrome(
            )  # Optional argument, if not specified will search path.
            driver.get('https://web.whatsapp.com/')
            time.sleep(10)  # Let the user actually see something!
            search_box = driver.find_element_by_css_selector(
                "input[title*='Search or start new chat']").click()
            # time.sleep(5)
            search_box = driver.find_element_by_css_selector(
                "input[title*='Search or start new chat']").send_keys(sender)
            #time.sleep(5)
            search_box = driver.find_element_by_css_selector(
                "span[title='%s']" % sender).click()

            s = ""
            element = driver.find_elements_by_css_selector(
                "span[class*='_F7Vk selectable-text invisible-space copyable-text']"
            )

            s = element[-1].get_attribute('innerHTML')
            data = s.split(">")
            s = data[1]
            da = s.split("<")
            mess = da[0]
            h = HTMLParser()
            mes = h.unescape(mess)
            print()
            print(mes + "SSS")
            print()
            mes = Decryptmsg(mes)
            print("HH" + mes + "HH")
            return render(request, 'decrypted.html', {
                'mes': mes,
                'sender': sender
            })

        except:
            print('NOT SEND')
            messages.success(
                request,
                'Try Again.. \n Check internet connection or write the username as saved in contacts list'
            )
            return render(request, 'decrypt.html', {})
        return render(request, 'home.html', {})
    else:
        return render(request, 'decrypt.html', {})
Example #34
 def __processResponse(self):
     self.lastresponse.raise_for_status()  # -> make sure it is 200
     self.lasthtml = HTMLParser.unescape(HTMLParser, self.lastresponse.text)
     parser = DDHtmlParser()
     parser.feed(self.lasthtml)
     self.lastdata = parser.return_data()
     try:
         contenttype = self.lastdata['meta']['http-equiv']['content-type']
         # "text/html; charset=iso-8859-1"
         self.lastencoding = rematch('^.*charset=([^;]*).*',
                                     contenttype).group(1)
         log.info('content-type: ' + contenttype)
         log.info('encoding: ' + self.lastencoding)
     except Exception:
         pass
Example #35
    def all_options(self):
        """Returns a list of tuples of all the options in the Select.

        Text first, value follows.


        Returns:
            A :py:class:`list` of :py:class:`Option`
        """
        # More reliable using javascript
        options = self.browser.execute_script(self.ALL_OPTIONS, self.browser.element(self))
        parser = HTMLParser()
        return [
            self.Option(normalize_space(parser.unescape(option[0])), option[1])
            for option in options]
Example #36
def DecodeQuery1(fileName):
    data = [x.strip() for x in open(fileName, "r").readlines()]
    query_list = []
    for item in data:
        item = item.lower()
        if len(item) > 50 or len(item) < 5:
            continue
        h = HTMLParser()
        item = h.unescape(item)  # convert escaped entities like &gt; or &nbsp; back to characters
        item = parse.unquote(item)  # URL-decode: convert the string to gbk encoding, then replace \x with %
        item, number = re.subn(r'\d+', "8", item)  # regex substitution
        item, number = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?:]+',
                               "http://u", item)
        query_list.append(item)
    return query_list
Example #37
def convert_unicode(text):
	"""
	converts unicode HTML to real Unicode
	"""
	if isPython2:
		h = HTMLParser()
		s = h.unescape(text)
	else:
		try:
			s = unescape(text)
		except Exception:
			# Python 3.3 and below
			# https://stackoverflow.com/a/2360639/2295672
			s = HTMLParser().unescape(text)
	return s
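A hedged sketch of the modern equivalent of convert_unicode above: on Python 3.4 and later, html.unescape() replaces HTMLParser().unescape(), which was deprecated in 3.4 and removed in 3.9, so new code can drop the version checks entirely. The helper name convert_unicode_py3 is made up for illustration.
import html

def convert_unicode_py3(text):
    """Convert HTML entities in text to real Unicode characters (Python 3.4+)."""
    return html.unescape(text)

# e.g. convert_unicode_py3('Caf&eacute; &amp; bar') == 'Café & bar'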
Example #38
    def find_user_id(url):
        """
        Find the user@id from flickr by loading the user-url and parsing the id (kind of hacky)
        """
        html = urllib.request.urlopen(url).read().decode('utf-8')

        m = re.search(r"href=\"/services/feeds/photos_public.gne\?([^\"]+)",
                      html)
        if m:
            h = HTMLParser()
            uid = h.unescape(m.group(1))
            uid = uid[3:uid.index("&")]
            return uid
        else:
            return None
Example #39
    def post(self, message):
        h = HTMLParser()
        message = h.unescape(message)
        url = re.findall(
            'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+',
            message)
        self.auth = tweepy.OAuthHandler(TWITTER['consumer_key'],
                                        TWITTER['consumer_secret'])
        self.auth.set_access_token(TWITTER['access_token'],
                                   TWITTER['access_token_secret'])

        self.api = tweepy.API(self.auth)

        logger.debug('Posting to Twitter')
        self.api.update_status(status=message)
Example #40
def normalize(text):
    if text is None:
        return None
    if text[:5] == 'orcid':
        text = text.split('>')[1]
    h = HTMLParser()
    text = str(h.unescape(text))
    normalizr = Normalizr(language='en')
    normalizations = [('replace_punctuation', {
        'replacement': ' '
    }), 'lower_case', ('remove_stop_words', {
        'ignore_case': 'False'
    }), 'remove_extra_whitespaces']
    text = normalizr.normalize(text, normalizations)
    return text
Example #41
    def plan_page(self, response):
        print(response.encoding)
        soup = BeautifulSoup(response.text, 'lxml', from_encoding="GB18030")
      
        last_page = int(soup.find(id=re.compile(r'LabelPageCount')).get_text())
        if last_page != response.save['page']:
            parmas = {'__EVENTTARGET': 'GridView1$ctl23$LinkButtonNextPage',
            '__EVENTARGUMENT': '', 
            }
            parmas['__VIEWSTATE'] = soup.find('input', {'name': '__VIEWSTATE'})['value']
            parmas['__EVENTVALIDATION'] = soup.find('input', {'name': '__EVENTVALIDATION'})['value']
            parmas['GridView1$ctl23$tbPage'] = str(response.save['page'])

            data = urlencode(parmas)
            temp = response.orig_url.split('&')
            url = temp[0]
            for i in temp[1:-1]:
                url += '&' + i
            print(url)
            url = url + '&' + str(response.save['page'] + 1)
            save_dict = response.save
            save_dict['page'] = save_dict['page'] + 1
            self.crawl(url, method='POST',
                data=data, callback=self.plan_page, age=1, save=save_dict)

        content = soup('a', {'target':'_blank'})
        print(soup.original_encoding)
        domains = {}
        domains[self.table_name[0]] = 'http://121.10.6.230/dggsweb/PHGSFJ/PHjsxmxzFJ.aspx?'
        domains[self.table_name[1]] = 'http://121.10.6.230/dggsweb/PHGSFJ/PHjsydghFJ.aspx?'
        domains[self.table_name[2]] = 'http://121.10.6.230/dggsweb/PHGSFJ/PHjsgcghFJ.aspx?' 
        domains[self.table_name[9]] = 'http://121.10.6.230/dggsweb/PQGSFJ/PQszFJ.aspx?'
        domains[self.table_name[10]] = 'http://121.10.6.230/dggsweb/PQGSFJ/PQyslzFJ.aspx?'
        domains[self.table_name[11]] = 'http://121.10.6.230/dggsweb/PQGSFJ/PQyslzFJ.aspx?'

        for i in content:
            link = i['href']
            h = HTMLParser()
            link = h.unescape(link)
            params = link.split('?')[1]
            params = params.split('&')
            print(params)
            link = ''
            link += quote('项目受理编号'.encode('gbk')) + '=' + quote(params[0].split('=')[1].encode('GB18030')) + '&' + quote('公示'.encode('gbk')) + '=' + quote(params[1].split('=')[1].encode('GB18030'))
            domain = domains[response.save['type']]
            link = domain + link
            # print(link)
            self.crawl(link, callback=self.content_page, save=response.save)
Example #42
def search_gpu(name):
    index = name.find("GB")
    if index == -1:
        pass
    else:
        name = name[0:index - 2]

    name = name.lower()
    name = name.replace("asus", "")
    name = name.replace("nvidia", "")
    name = name.replace("msi", "")
    name = name.replace("gigabyte", "")
    name = name.replace("pascal", "")
    name = name.replace("evga", "")
    name = name.replace("sapphire", "")
    name = name.replace("zotac", "")
    name = name.replace("amd", "")
    name = name.replace("frontier edition", "")
    name = name.replace("gainward", "")
    name = name.replace("xfx", "")
    name = name.replace("powercolor", "")
    name = name.replace("intel", "")
    name = name.replace("pro hd", "")
    name = name.replace("graphics", "")
    name = name.replace("ati", "")
    name = eraseFromString(name, "(", ")")
    name = name.replace("  ", " ")
    str = "https://www.passmark.com/search/zoomsearch.php?zoom_query=" + name + " price performance"
    try:
        r = requests.get(str).text
    except:
        return "RE"

    r = remove_html_markup(r)
    h = HTMLParser()
    r = h.unescape(r)
    index2 = r.find("- Price")
    if index2 == -1:
        index2 = len(r)

    index = r.rfind("PassMark -", 0, index2)
    if index == -1:
        return "NF"
    ret = r[index + 11:index2 - 1]

    if len(ret) > 50:
        return "NF"
    return ret
Example #43
 def get_lyrics(self, song):
     log('%s: searching lyrics for %s - %s' %
         (__title__, song.artist, song.title),
         debug=self.DEBUG)
     lyrics = Lyrics(settings=self.settings)
     lyrics.song = song
     lyrics.source = __title__
     lyrics.lrc = __lrc__
     try:
         req = requests.get(self.url % (urllib.parse.quote(
             song.artist), urllib.parse.quote(song.title)),
                            timeout=10)
         response = req.text
     except:
         return None
     data = json.loads(response)
     try:
         self.page = data['url']
     except:
         return None
     if not self.page.endswith('action=edit'):
         log('%s: search url: %s' % (__title__, self.page),
             debug=self.DEBUG)
         try:
             req = requests.get(self.page, timeout=10)
             response = req.text
         except requests.exceptions.HTTPError as error:  # strange... sometimes lyrics are returned with a 404 error
             if error.response.status_code == 404:
                 response = error.response.text
             else:
                 return None
         except:
             return None
         matchcode = re.search("class='lyricbox'>(.*?)<div", response)
         try:
             lyricscode = (matchcode.group(1))
             htmlparser = HTMLParser()
             lyricstext = htmlparser.unescape(lyricscode).replace(
                 '<br />', '\n')
             lyr = re.sub('<[^<]+?>', '', lyricstext)
             if LIC_TXT in lyr:
                 return None
             lyrics.lyrics = lyr
             return lyrics
         except:
             return None
     else:
         return None
Example #44
def svm_dev_clean_data(json_data):
    clean_json_data = json_data
    html_parser = HTMLParser()
    remove_digits = str.maketrans('', '', digits)
    count = 0
    for obj in clean_json_data:
        # print('Before :' + obj['text'])
        obj['text'] = obj['text'].lower()
        obj['text'] = html_parser.unescape(obj['text'])
        obj['text'] = obj['text'].translate(remove_digits)
        obj['text'] = ' '.join(obj['text'].split())
        obj['text'] = strip_links(obj['text'])
        # obj['text'] = strip_all_entities(obj['text'])
        # print('After :' + obj['text'])
        count += 1
    return clean_json_data
Example #45
 def format_transcript_text(self, text):
     """
     Prepare unicode transcripts to be converted to WebVTT format.
     """
     new_text = [
         self.format_transcript_text_line(line)
         for line in text[0].splitlines()
     ]
     new_text = '\n'.join(new_text)
     html_parser = HTMLParser()
     unescaped_text = html_parser.unescape(new_text)
     if u"WEBVTT" not in text:
         text = "WEBVTT\n\n" + unescaped_text
     else:
         text = unescaped_text
     return text
Example #46
def convert_data(file_path):
    full_data = []
    html_parser = HTMLParser()
    with open(file_path, "r", encoding="utf-8") as file:
        for line in file.readlines():
            temp_data = json.loads(line)
            # €: alt + 0128
            data = temp_data["id"] + "€" + temp_data["title"] + "€" + \
                        temp_data["content"].replace("\n", "") + "€" + \
                        str(temp_data["tags"]).strip("[").strip("]").replace("'","").replace(" ", "")
            data = html_parser.unescape(data).replace(
                "\n", "")  # 为了将爬虫中HTML语言的的转义字符,如&amp, &lt, &gt等转回正常的符号。
            full_data.append(data)
    print("Successfully convert data!")
    print("Full Data Size: ", len(full_data))
    return full_data
Example #47
def ogdch_localize_activity_item(msg):
    """localizing activity messages: this function gets an html message and
    replaces the language dict in there with the localized value
    """
    parser = HTMLParser()
    unescaped_msg = parser.unescape(msg)

    language_dict_result = re.search(REGEX_LANGUAGE_DICT, unescaped_msg)
    if not language_dict_result:
        return tk.literal(msg)

    language_dict = language_dict_result.group(0)
    localized_language_dict = get_localized_value_for_display(language_dict)
    localized_msg = unescaped_msg.replace(language_dict,
                                          localized_language_dict)
    return tk.literal(localized_msg)
Example #48
    def downloadFile(self, row, code_directory):

        try:
            # Extract different parameter from the row
            contestId = str(row["problem"]["contestId"])
            problemIndex = str(row["problem"]["index"])
            submissionId = str(row["id"])
            problemName = str(row["problem"]["name"]).replace(" ", "-")
            submissionLanguage = row["programmingLanguage"]
            contestName = self.getContestName(contestId)

            # open submission page for the submission Id
            url = "http://codeforces.com/contest/" + str(
                contestId) + "/submission/" + str(submissionId)
            resp = urlopen(url)
            html = BeautifulSoup(resp, "html.parser")

            logging.info("Problem: " + str(contestName) + "-" +
                         str(problemName))
            # Extract the source code
            sourceCodeDiv = html.body.find('pre',
                                           attrs={'id': 'program-source-text'})

            # Convert html entities to human readable format
            # e.g.: "&lt" becomes "<", "&gt" becomes ">"
            h = HTMLParser()
            code = h.unescape(str(sourceCodeDiv.text))

            # Create folder according to contest name
            save_directory = code_directory + os.path.sep + str(contestName)

            if not os.path.exists(save_directory):
                logging.info("Creating Directory: " + save_directory)
                os.mkdir(save_directory)

            # Save the source code in file
            filename = save_directory + os.path.sep + problemIndex + "-" + problemName + "." + self.getFileExtension(
                submissionLanguage)
            logging.info("Saving Source code in file:" +
                         str(os.path.basename(filename)))
            with open(filename, "w+") as f:
                f.write(str(code))
        except Exception as e:
            logging.info("Error while downloading file !!!")
            raise

        return
Example #49
async def grabSteamNews():
    r = requests.get(
        'http://api.steampowered.com/ISteamNews/GetNewsForApp/v0002/?appid=730&count=1&maxlength=300&format=json'
    )

    decoded = json.loads(r.text)
    htmlparser = HTMLParser()

    data = decoded["appnews"]["newsitems"][0]
    values = [
        data["url"], data["feedlabel"], data["title"],
        htmlparser.unescape(data["contents"]), data["gid"]
    ]
    return (values)


#print(grabSteamNews())
Example #50
def get_json_row(cacheid):
    """ return a row as json """

    cacheid = cacheid.upper()
    conn = mysqlite.check_db()
    g_c = get_row(conn, cacheid)
    close_db(conn)

    if g_c is None:
        return "{}"

    html = HTMLParser()
    g_c.cachename = html.unescape(g_c.cachename)
    g_c.dltime = int(time.time()) - int(g_c.dltime)
    g_c.body = htmlcode.remove_all_attrs_except(g_c.body)

    return str(g_c)
Example #51
def html_xml():
    s = 'Elements are written as "<tag>text</tag>".'
    print(s)
    print(html.escape(s))

    # Disable escaping of quotes
    print(html.escape(s, quote=False))

    s = 'Spicy Jalapeño'
    print(s.encode('ascii', errors='xmlcharrefreplace'))

    s = 'Spicy &quot;Jalape&#241;o&quot.'
    p = HTMLParser()
    print(p.unescape(s))

    t = 'The prompt is &gt;&gt;&gt;'
    print(unescape(t))
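For reference, a doctest-style sketch of what the calls above produce; the outputs are reproduced from memory of the standard library behaviour, so treat them as approximate rather than authoritative.
>>> import html
>>> html.escape('Elements are written as "<tag>text</tag>".')
'Elements are written as &quot;&lt;tag&gt;text&lt;/tag&gt;&quot;.'
>>> html.escape('Elements are written as "<tag>text</tag>".', quote=False)
'Elements are written as "&lt;tag&gt;text&lt;/tag&gt;".'
>>> html.unescape('Spicy &quot;Jalape&#241;o&quot.')
'Spicy "Jalapeño".'
>>> html.unescape('The prompt is &gt;&gt;&gt;')
'The prompt is >>>'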
Example #52
def main():
    s = 'Elements are written as "<tag>text</tag>".'
    print(s)
    print(html.escape(s))
    print("Disable quotes escape")
    print(html.escape(s, quote=False))

    print("Displaying non-asc text as asc")
    s = 'Spicy Jalapeño'
    print(s.encode('ascii', errors='xmlcharrefreplace'))

    s = 'Spicy &quot;Jalape&#241;o&quot.'
    p = HTMLParser()
    print(p.unescape(s))

    t = 'The prompt is &gt;&gt;&gt;'
    print(unescape(t))
Example #53
    def save_videojs(self, data, suffix=''):
        """
        The saving handler.
        """
        i18n_ = self.runtime.service(self, "i18n").ugettext

        self.display_name = data['display_name']
        self.url = data['url'].strip()

        for language in self.languages.keys():
            subtitle_text = data['subtitle_text_' + language].strip()
            if subtitle_text:
                reader = detect_format(subtitle_text)
                if reader:
                    try:
                        subtitle = WebVTTWriter().write(
                            reader().read(subtitle_text))
                    except:
                        return Response(json.dumps({
                            'error':
                            i18n_(
                                "Error occurred while saving VTT subtitles for language %s"
                            ) % language.upper()
                        }),
                                        status=400,
                                        content_type='application/json',
                                        charset='utf8')
                    h = HTMLParser()
                    self.subtitles[language] = h.unescape(subtitle)

                    self.create_subtitles_file(self.subtitles[language])
                else:
                    return Response(json.dumps({
                        'error':
                        i18n_(
                            "Error occurred while saving VTT subtitles for language %s"
                        ) % language.upper()
                    }),
                                    status=400,
                                    content_type='application/json',
                                    charset='utf8')
            else:
                self.subtitles[language] = ""

        return {'result': 'success'}
Example #54
def parse_p4p(response):
    resp_json = response.json()

    total_page = resp_json['totalPage']
    current_page = resp_json['currentPage']
    next_page = current_page + 1 if current_page < total_page else None

    p4ps = list()
    data = resp_json['data']
    parser = HTMLParser()
    for item in data:
        p4ps.append(P4P(
            keyword=parser.unescape(item['keyword']),
            qs_star=item['qsStar'],
            is_start=(item['state']=="1"),
            tag=item['tag'],
        ))
    return next_page, p4ps
Example #55
def DecodeQuery2(fileName1, fileName2):
    data1 = [x.strip() for x in open(fileName1, "r").readlines()]
    data2 = [x.strip() for x in open(fileName2, "r").readlines()]
    query_list = []
    time_list = []
    for item1, item2 in zip(data1, data2):
        item1 = item1.lower()
        if len(item1) > 50 or len(item1) < 5:
            continue
        h = HTMLParser()
        item1 = h.unescape(item1)  # convert escaped entities like &gt; or &nbsp; back to characters
        item1 = parse.unquote(item1)  # URL-decode: convert the string to gbk encoding, then replace \x with %
        item1, number = re.subn(r'\d+', "8", item1)  # regex substitution
        item1, number = re.subn(r'(http|https)://[a-zA-Z0-9\.@&/#!#\?:]+',
                                "http://u", item1)
        query_list.append(item1)
        time_list.append(item2)
    return query_list, time_list
Example #56
def decode_hash(hashed):
    """Use this function, Luke."""
    email = ''
    prefix = '0x' + hashed[0:2]
    slicer = 2
    while True:
        try:
            email += '&#' + \
                ('0' +
                 hex(int(
                     '0x' + str(
                         int(hashed[slicer:slicer + 2], 16) ^ int(prefix, 16)),
                     16))
                 )[3:]
            slicer += 2
        except ValueError:
            parser = HTMLParser()
            return parser.unescape(email)
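A small usage sketch with a hand-built hash (not taken from any real page): the first hex byte is the XOR key, and every following byte XORed with it yields one character code that the function rewrites as a numeric entity before unescaping. This assumes a Python version whose unescape accepts numeric references without trailing semicolons (3.4 onwards).
# hypothetical input: key 0x42, remaining bytes are 'a@b.c' XORed with the key
print(decode_hash('422302206c21'))  # expected output: a@b.c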
Example #57
def remove_twitter_syntax(text):
    """
    Removes special words or syntax from Twitter, including RT, users, urls

    :param text: tweet to be cleaned
    :return: str with the clean tweet
    """
    html_parser = HTMLParser()
    text = html_parser.unescape(text)
    text = re.sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "", text,
                  flags=FLAGS)  # remove urls
    text = re.sub(r"@\w+", "", text, flags=FLAGS)  # remove users
    text = re.sub(r"<user>", "", text, flags=FLAGS)
    text = re.sub(r"<url>", "", text, flags=FLAGS)
    text = re.sub('RT[\s]+', "", text, flags=FLAGS)  # removes RT
    text = re.sub('rt[\s]+', "", text, flags=FLAGS)  # removes RT
    text = re.sub(' +', ' ', text, flags=FLAGS)
    return text.strip()
Example #58
 def transform_old_subtitle_to_new_form_if_exist(self, modify=False):
     """
     It is possible to modify self.subtitles only in the studio_view and save_videojs functions: studio_view only modifies the
     self.subtitles dict dynamically, while save_videojs does so permanently.
     """
     i18n_ = self.runtime.service(self, "i18n").ugettext
     subtitles = copy.deepcopy(self.subtitles)
     if (not 'pl' in self.subtitles
         ) and self.subtitle_url and self.subtitle_text:
         reader = detect_format(self.subtitle_text)
         if reader:
             try:
                 subtitle = WebVTTWriter().write(reader().read(
                     self.subtitle_text))
             except:
                 return Response(json.dumps({
                     'error':
                     i18n_(
                         "Error occurred while saving VTT subtitles for language PL"
                     )
                 }),
                                 status=400,
                                 content_type='application/json',
                                 charset='utf8')
             h = HTMLParser()
             subtitles['pl'] = h.unescape(subtitle)
             if modify:
                 self.subtitles = subtitles
                 self.subtitle_url = self.subtitle_text = ""
             else:
                 return subtitles
         else:
             return Response(json.dumps({
                 'error':
                 i18n_(
                     "Error occurred while saving VTT subtitles for language PL"
                 )
             }),
                             status=400,
                             content_type='application/json',
                             charset='utf8')
     else:
         return subtitles
Example #59
    def query(self, qry):
        params = {
            "action":
            "feedcontributions",
            "user":
            qry.encode('raw_unicode_escape').decode("ascii", errors='replace')
        }

        if self.opts['days_limit'] != "0":
            dt = datetime.datetime.now() - datetime.timedelta(
                days=int(self.opts['days_limit']))
            params["year"] = dt.strftime("%Y")
            params["month"] = dt.strftime("%m")

        res = self.sf.fetchUrl(
            f"https://en.wikipedia.org/w/api.php?{urllib.parse.urlencode(params)}",
            timeout=self.opts['_fetchtimeout'],
            useragent="SpiderFoot")

        if res['code'] in ["404", "403", "500"]:
            self.debug(
                f"Unexpected response code {res['code']} from Wikipedia")
            return None

        if not res['content']:
            return None

        links = list()

        try:
            parser = HTMLParser()

            for line in res['content'].split("\n"):
                matches = re.findall("<link>(.*?)</link>", line, re.IGNORECASE)
                for m in matches:
                    if "Special:Contributions" in m:
                        continue
                    d = parser.unescape(m)
                    links.append(d)
            return set(links)
        except Exception as e:
            self.error(f"Error processing response from Wikipedia: {e}")
            return None
Example #60
    def correct(self, text):
        # grab html
        html = self.get_page('http://www.google.com/search?hl=en&q=' + urllib.parse.quote(text) + "&meta=&gws_rd=ssl")
        html_parser = HTMLParser()

        # save html for debugging
        # open('page.html', 'w').write(html)

        # pull pieces out
        match = re.search(r'(?:Showing results for|Did you mean|Including results for).*?<a.*?>(.*?)</a>', html)
        if match is None:
            fix = text
        else:
            fix = match.group(1)
            fix = re.sub(r'<.*?>', '', fix)
            fix = html_parser.unescape(fix)

        # return result
        return fix