Example #1
 def extract_seo_h1_tag(self):
     'Get info from seo_h1_tag'
     html = self.chrome.get_inner_html_by_id('seo_h1_tag')
     if html is None:  # no seo_h1_tag?
         return None
     name = rsub('"[^"]*"', '', html)
     name = rsub('<[^>]*>', '', name)
     if name != '':
         account = {'name': name}
     else:
         account = {'name': 'undetected'}
     href = self.ct.search(r' href="https://www\.facebook\.com/[^/?"]+', html)
     if href is not None:
         account['type'] = 'pg'
         account['path'] = href[32:]
         account['link'] = href[7:]
         account['id'] = 'undetected'
     else:
         href = self.ct.search(' href="/groups/[^/?"]+', html)
         if href is not None:
             account['type'] = 'groups'
             account['path'] = 'groups_' + href[15:]
             account['link'] = 'https://facebook.com' + href[7:]
             account['id'] = 'undetected'
         else:
             return None
     return account
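For reference, a small sketch (with a made-up href value) of what the slice offsets above pick out of the matched text; self.ct.search is assumed to return the matched substring or None:

matched = ' href="https://www.facebook.com/somepage'
print(matched[7:])   # https://www.facebook.com/somepage  (the link)
print(matched[32:])  # somepage  (the path)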
Example #2
 def extract_targets(self, target):
     'Extract paths (= URLs without ...instagram.com/) from given targets'
     l = []  # list for the target users (id or path)
     for i in self.ct.split(target):
         i = rsub(r'^.*instagram\.com/', '', i)
         i = rsub('/.*$', '', i)
         if i != '' and i != 'p':
             l.append(i)
     return l
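A minimal standalone sketch (hypothetical URL, with rsub assumed to be re.sub) of what the two substitutions above leave behind:

from re import sub as rsub

url = 'https://www.instagram.com/someuser/?hl=en'
path = rsub(r'^.*instagram\.com/', '', url)  # drop everything up to and including instagram.com/
path = rsub(r'/.*$', '', path)               # drop the trailing slash and query string
print(path)  # someuser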
Example #3
 def extract_paths(self, target):
     'Extract Facebook paths from targets that might be URLs'
     l = []  # list for the target users (id or path)
     for i in self.ct.split(target):
         i = rsub(r'^.*facebook\.com/', '', i.rstrip('/'))
         i = rsub('&.*$', '', i)
         if i != '':
             l.append(i)
     return l
Example #4
async def translate(message, client, arguments):

    # Checking whether the input contains a --language flag (fragile; was painful, do not touch)
    try:
        popped = rsearch(r"--([a-zA-Z0-9])\w+", arguments).group()
    except AttributeError:
        google = quote(str(arguments))
        language = translate_to_lang
    else:
        google = quote(str(rsub(r"--([a-zA-Z0-9])\w+", "", arguments)))
        language = popped[2:]

    #Creating and fetching link
    query = "https://translation.googleapis.com/language/translate/v2?key=%s&target=%s&q=%s" % (
        google_api, language, google)
    response = loads(rget(query).text)

    # Trying to create message
    try:
        detectedlanguage = response["data"]["translations"][0][
            "detectedSourceLanguage"]
        translatedtext = response["data"]["translations"][0]["translatedText"]
        letter = ":cloud:  **| " + detectedlanguage.upper(
        ) + " -> " + language.upper() + "  `" + translatedtext + "`**"

    # if the message can't be created, return an error
    except KeyError:
        letter = ":cloud:  **| Invalid language target!**"

    # sending message
    await client.send_message(message.channel, letter)
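A small standalone trace (made-up argument string) of the flag extraction above, with rsearch and rsub assumed to be re.search and re.sub:

from re import search as rsearch, sub as rsub

arguments = 'hello world --de'
popped = rsearch(r"--([a-zA-Z0-9])\w+", arguments).group()  # '--de'
text = rsub(r"--([a-zA-Z0-9])\w+", "", arguments)           # 'hello world '
language = popped[2:]                                        # 'de'
print(language, text)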
Example #5
 def get_profile_name(self, html):
     'Extract name'
     m = rsearch('>[^<]+</a>', html)
     if m is not None:
         return m.group()[1:-4]
     m = rsearch('>[^<]+<span[^>]*>[^<]+</span>[^<]*</a>', html)
     if m is not None:
         return rsub('<[^>]+>', '', m.group()[1:-4])
     return 'undetected'
Example #6
File: routing.py Project: polizei/misc
    def _conditionalize(self, args):
        part = args[0]

        next = []
        segments = args[1:]
        for segment in segments:
            next.append(self._conditionalize(segment))
        next = ', '.join(next)
        if next:
            next = ''.join([', ', next])

        conditions = []
        for name in rfind('[:*](\\w+)', part):
            conditions.append(''.join(['"', name, '" in parameters']))
        if conditions:
            conditions = ' and '.join(conditions)
        else:
            conditions = 'true'

        return ''.join(['((', conditions, ') and "".join(["', rsub('[:*](\\w+)', '", parameters["\\1"], "', part), '"', next, ']) or "")'])
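A standalone illustration (made-up route fragment) of the backreference-in-replacement trick the last line relies on, with rsub standing in for re.sub:

from re import sub as rsub

part = '/blog/:year/:month'
print(rsub(r'[:*](\w+)', r'", parameters["\1"], "', part))
# /blog/", parameters["year"], "/", parameters["month"], "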
Example #7
File: inflector.py Project: polizei/misc
	def singularize(cls, what):
		'''Singularizes english words (example: people => person, sheep => sheep, lines => line)'''

		for x in range(len(cls._uncountable) - 1, -1, -1):
			value = cls._uncountable[x][0]

			if value == what[-len(value):].lower():
				return what

		for x in range(len(cls._irregular) - 1, -1, -1):
			key = cls._irregular[x][1]
			value = cls._irregular[x][0]

			if key == what[-len(key):].lower():
				return what[:-len(key)] + value

		for x in range(len(cls._singular) - 1, -1, -1):
			key = cls._singular[x][0]
			value = cls._singular[x][1]

			if rsearch(key, what, I):
				return rsub(key, value, what, flags=I)

		return what
Example #8
File: inflector.py Project: polizei/misc
	def pluralize(cls, what):
		'''Pluralizes english words (example: person => people, news => news, post => posts)'''

		for x in range(len(cls._uncountable) - 1, -1, -1):
			value = cls._uncountable[x][0]

			if value == what[-len(value):].lower():
				return what

		for x in range(len(cls._irregular) - 1, -1, -1):
			key = cls._irregular[x][0]
			value = cls._irregular[x][1]

			if key == what[-len(key):].lower():
				return what[:-len(key)] + value

		for x in range(len(cls._plural) - 1, -1, -1):
			key = cls._plural[x][0]
			value = cls._plural[x][1]

			if rsearch(key, what, I):
				return rsub(key, value, what, flags=I)

		return what
Example #9
 def href(self, html):
     'Search href='
     try:
         return rsub('&amp;', '&', self.search(' href="[^"]+', html)[7:])
     except:
         return None
Example #10
File: inflector.py Project: polizei/misc
	def urlize(cls, what, delimiter='_'):
		'''Returns the sentence passed as a URL slug (example: what's goin' on out there? => what_s_goin_on_out_there)'''

		return rsub(delimiter + '+', delimiter, rsub('[^0-9a-z]', delimiter, cls.underscore(cls.unaccent(cls.latinize(what))))).strip(delimiter)
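A rough standalone sketch of the two rsub passes that do the actual slugging, ignoring the project's latinize/unaccent/underscore helpers (not shown here) and simply lowercasing instead:

from re import sub as rsub

def slugify(text, delimiter='_'):
    # hypothetical simplified stand-in for urlize
    text = rsub('[^0-9a-z]', delimiter, text.lower())               # non-alphanumerics -> delimiter
    return rsub(delimiter + '+', delimiter, text).strip(delimiter)  # collapse runs, trim the ends

print(slugify("what's goin' on out there?"))  # what_s_goin_on_out_there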
Example #11
File: inflector.py Project: polizei/misc
	def underscore(cls, what):
		'''Underscores a camelized word (example: BlogPosts => blog_posts, MyDBConnector => my_db_connector)'''

		return rsub('_+', '_', rsub('([A-Z]+)', r'_\1', rsub('([A-Z]+)([A-Z][a-z])', r'_\1_\2', what))).lstrip('_').lower()
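A step-by-step trace (hypothetical input) of what the nested substitutions produce, matching the docstring's MyDBConnector example:

from re import sub as rsub

what = 'MyDBConnector'
step1 = rsub('([A-Z]+)([A-Z][a-z])', r'_\1_\2', what)  # 'My_DB_Connector'
step2 = rsub('([A-Z]+)', r'_\1', step1)                # '_My__DB__Connector'
print(rsub('_+', '_', step2).lstrip('_').lower())      # my_db_connector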
Example #12
File: routing.py Project: polizei/misc
    def __init__(self, route, **kwargs):
        self._route = route

        # set/generate name
        if 'name' in kwargs:
            self._name = kwargs.pop('name')

        # set common options
        if 'defaults' in kwargs:
            self._defaults = kwargs.pop('defaults')
        if 'constraints' in kwargs:
            self._constraints = kwargs.pop('constraints')
        if 'formats' in kwargs:
            self._formats = kwargs.pop('formats')
        if 'limits' in kwargs:
            self._limits = kwargs.pop('limits')

        # everything else should be the map then, unless a map argument was passed
        if 'map' in kwargs:
            self._map = kwargs.pop('map')
        else:
            self._map = kwargs

        # check name once again and try to set it against map
        if not self._name:
            if self._map:
                self._name = '_'.join([str(value) for value in self._map.values()])
            else:
                self._name = id(self)

        # scan parameter names and parameter types (optional/required, single/multiple)
        l = len(route)
        x = paren = 0
        while x < l:
            c = route[x]
            x += 1

            if c == '(':
                paren += 1
            elif c == ')':
                paren -= 1
            elif c == ':' or c == '*':
                name = []
                while x < l and route[x].isalnum():
                    name.append(route[x])
                    x += 1
                name = ''.join(name)

                self._parameters.append(name)

                if not paren:
                    self._required.append(name)

                if c == '*':
                    self._multiple.append(name)

        # set defaults for action and format
        if 'action' in self._parameters and 'action' not in self._defaults:
            self._defaults['action'] = 'index'
        if 'format' in self._parameters and 'format' not in self._defaults:
            self._defaults['format'] = 'html'

        # build the regular expression with a regular expression (and a couple of string substitutions, though)
        self._regex = rsub('[:*](\\w+)', self._constraintize, route.replace('(', '(?:').replace(')', ')?').replace('.', '\\.'))

        # build the evaluated urlize code with nested expression and a couple of regular expression substitutions
        expr = nestedExpr('(', ')')
        parsed = expr.parseString(''.join(['(', route, ')']))[0]
        self._code = ''.join(['result = ', self._conditionalize(parsed).replace(', ""', '')])
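The _constraintize callback passed to rsub above is not shown; a small standalone sketch (made-up constraint table and route) of how a function can serve as the re.sub replacement:

from re import sub as rsub

constraints = {'id': r'\d+'}

def constraintize(match):
    # hypothetical stand-in for the real _constraintize method
    name = match.group(1)
    return '(?P<%s>%s)' % (name, constraints.get(name, '[^/.]+'))

print(rsub(r'[:*](\w+)', constraintize, '/posts/:id.:format'))
# /posts/(?P<id>\d+).(?P<format>[^/.]+)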
Example #13
def sub(x, y, z):
    return rsub(y, z, x)
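A quick usage sketch showing that this wrapper just reorders the arguments so the subject string comes first (pattern and replacement follow):

from re import sub as rsub

def sub(x, y, z):
    return rsub(y, z, x)

print(sub('hello world', 'world', 'there'))  # hello there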
Example #14
 def src(self, html):
     'Get src='
     try:
         return rsub('&amp;', '&', self.search(' src="[^"]+', html)[6:])
     except:
         return None
Example #15
def retrieve():
    database = connect('database.db')
    topics, feeds, documents, titles, descriptions = [[], [], [], [], []]
    links, datetimes, thumbnails, doc_topics = [[], [], [], []]

    ### GET DATABASE DATA
    for row in database.execute('SELECT * FROM topics;'):
        topics.append([row[0], str(row[1])])
    for row in database.execute('SELECT fds_topic, fds_link FROM feeds;'):
        feeds.append([row[0], str(row[1])])
    for row in database.execute(
            'SELECT doc_id, doc_datetime, doc_link FROM documents'):
        documents.append([row[0], str(row[1]), str(row[2]), []])
        for row2 in database.execute(
                'SELECT tpd_topic FROM tpc_doc WHERE tpd_document = ' +
                str(row[0]) + ';'):
            documents[-1][3].append(row2[0])

    ### GET RSS INFO
    for topic, link in feeds:
        html = urlopen(link).read()
        soup = BeautifulSoup(html)
        items = [item for item in soup.find_all('item')]
        for item in items:
            doc_topics.append(topic)
            if item.title is not None:
                title = item.title.findAll(text=True)
                if len(title) == 1:
                    titles.append(title[0].encode('ascii', errors='ignore'))
                else:
                    titles.append('')
            if item.description is not None:
                desc = item.description.findAll(text=True)
                if len(desc) == 1:
                    descriptions.append(desc[0].encode('ascii',
                                                       errors='ignore'))
                else:
                    descriptions.append('')
            if item.guid is not None:
                link = item.guid.findAll(text=True)
                if len(link) == 1:
                    links.append(link[0].encode('ascii', errors='ignore'))
                else:
                    links.append('')
            if item.pubdate is not None:
                date = item.pubdate.findAll(text=True)
                if len(date) == 1:
                    datetimes.append(date[0].encode('ascii', errors='ignore'))
                else:
                    datetimes.append('')
            thumb = item.findChildren('media:thumbnail', {'width': '144'})
            if len(thumb) == 1:
                thumbnails.append(thumb[0]['url'].encode('ascii',
                                                         errors='ignore'))
            else:
                thumbnails.append('')

    ### GET DOCUMENTS
    new = 0
    updated = 0
    for index in range(len(titles)):
        print('(' + str(index + 1).ljust(4) + str(doc_topics[index]).ljust(2) +
              ')'),

        datetime = parser.parse(datetimes[index])
        try:
            pos = [doc[2] for doc in documents].index(links[index])
        except ValueError:
            refresh = 0
        else:
            if doc_topics[index] not in documents[pos][3]:
                database.execute('INSERT INTO tpc_doc (tpd_topic, tpd_document) VALUES'+\
                 ' ('+str(doc_topics[index])+', '+str(documents[pos][0])+');')
                documents[pos][3].append(doc_topics[index])
                database.commit()
                print('*'),
            if str(datetime) == str(documents[pos][1]):
                print('Unchanged Article')
                continue
            refresh = 1

        not_article = ('VIDEO', 'AUDIO', 'In pictures', 'Your pictures')
        if titles[index].startswith(not_article):
            print('Not an Article')
            continue

        html = urlopen(links[index]).read()
        soup = BeautifulSoup(html)
        title = str(soup.title)[7:-8].decode('utf-8').encode('ascii',
                                                             errors='ignore')

        temp = [
            'BBC News', 'BBC History', 'BBC Science', 'BBC Consumer',
            'BBC Arts', 'BBC Nature'
        ]
        if any(i in title for i in temp): division = 'story-body'
        elif 'BBC Sport' in title: division = 'article'
        elif 'BBC - Capital' in title: division = 'description|story-body'
        else:
            print('Website not known')
            continue

        content = [
            div for div in soup.find_all('div', {'class': rcompile(division)})
        ]
        soup = BeautifulSoup(' '.join(list(map(str, content))))
        paragraphs = [p for p in soup.findAll('p')]
        soup = BeautifulSoup(' '.join(list(map(str, paragraphs))))
        [
            p.extract() for p in soup.findAll('p')
            if str(p).startswith('<p><strong>')
        ]
        [
            p.extract()
            for p in soup.findAll('p', {'class': rcompile('disclaimer|terms')})
        ]

        text = soup.get_text().replace('\n',
                                       ' ').replace('\t',
                                                    ' ').replace('\r', ' ')
        text = text.encode('ascii', errors='ignore')
        if text == '':
            print('Empty Text')
            continue

        text = rsub(' +', ' ', text)
        text = text.strip()
        text = '\n'.join([sentence for sentence in sent_tokenize(text)])

        if refresh == 1:
            documents[pos][1] = str(datetime)
            database.execute('DELETE FROM entities WHERE ent_document = ' +
                             str(documents[pos][0]) + ';')
            database.execute('UPDATE documents SET doc_processed = 0,'+\
             ' doc_datetime = \''+str(datetime)+'\','+\
             ' doc_thumbnail = \''+thumbnails[index]+'\','+\
             ' doc_title = \''+titles[index].replace('\'','\'\'')+'\','+\
             ' doc_description = \''+descriptions[index].replace('\'','\'\'')+'\','+\
             ' doc_text = \''+text.replace('\'','\'\'')+'\''+\
             ' WHERE doc_link = \''+links[index]+'\';')
            print('Update - ' + titles[index])
            updated += 1
        else:
            documents.append([
                len(documents) + 1, datetime, links[index],
                [doc_topics[index]]
            ])
            database.execute('INSERT INTO tpc_doc (tpd_topic, tpd_document) VALUES'+\
             ' ('+str(doc_topics[index])+', '+str(documents[-1][0])+');')
            database.execute('INSERT INTO documents (doc_datetime, doc_link, doc_thumbnail,'+\
             ' doc_title, doc_description, doc_text) VALUES (\''+\
             str(datetime)+'\',\''+links[index]+'\',\''+thumbnails[index]+'\',\''+\
             titles[index].replace('\'','\'\'')+'\',\''+\
             descriptions[index].replace('\'','\'\'')+'\',\''+\
             text.replace('\'','\'\'')+'\');')
            print('Insert - ' + titles[index])
            new += 1

        database.commit()

    print new, "new,", updated, "updated."
Example #17
File: train.py Project: muraj/BotMuraj
#!/usr/bin/env python
import sys
from re import sub as rsub
import WordSub, DefaultSubs
"""TODO: Bug with multi-sentence pattern/that/template(s)"""
if __name__ == "__main__":
	xml=['<?xml version=\'1.0\' encoding=\'ISO-8859-1\'?>','<aiml>']
	that=rsub(r'<[a-zA-Z\/][^>]*>','',sys.argv[1].upper())
	pattern=rsub(r'<[a-zA-Z\/][^>]*>','',sys.argv[2].upper())
	template=rsub(r'<[a-zA-Z\/][^>]*>','',sys.argv[3])

	subbers={}	#Do aiml default substitutions for best match.
	subbers['gender']=WordSub.WordSub(DefaultSubs.defaultGender)
	subbers['person']=WordSub.WordSub(DefaultSubs.defaultPerson)
	subbers['person2']=WordSub.WordSub(DefaultSubs.defaultPerson2)
	subbers['normal']=WordSub.WordSub(DefaultSubs.defaultNormal)
	for sub in subbers:
		that=subbers[sub].sub(that)
		pattern=subbers[sub].sub(pattern)

	xml.append('<category>')
	xml.append("<pattern>%s</pattern>" % pattern)
	if that != '':
		xml.append("<that>%s</that>" % that)
	xml.append("<template>%s</template>" % template)
	xml.append('</category>')
	
	xml.append('</aiml>')
	with open('learning.aiml', 'w') as f:
		f.write('\n'.join(xml))
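A tiny standalone demo (made-up input) of the tag-stripping pattern used above:

from re import sub as rsub

raw = 'Hello <b>world</b>, how are <i>you</i>?'
print(rsub(r'<[a-zA-Z\/][^>]*>', '', raw))  # Hello world, how are you?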
Example #18
def underscore(what):
    return rsub('_+', '_', rsub('([A-Z]+)', r'_\1',
                rsub('([A-Z]+)([A-Z][a-z])', r'_\1_\2', what))).lstrip('_').lower()