Example #1
    def string(html, cssSelector='*') -> str:
        if isinstance(html, str):
            return strip_markup(html).strip()

        if not isinstance(html, Selector):
            html = Selector(html)

        data = html.css(cssSelector).extract_first()
        if data is None:
            data = ''

        data = strip_markup(data)
        return data.strip()
    def getRepos(self, name=None):

        yield self.header()

        if not name:
            yield '<p>Please actually enter a username.</p>'
        else:
            name = strip_markup(name)
            try:
                url = "https://api.github.com/users/%s/repos" % name
                req = urllib2.Request(url)
                opener = urllib2.build_opener()
                f = opener.open(req)
                data = json.load(f)
                gotten = True
            except urllib2.URLError as e:
                if '404' in str(e):
                    yield "<p>Something went wrong; you likely didn't type the username correctly.</p>"
                elif '403' in str(e):
                    yield "<p>Something went wrong; you've likely exceeded the GitHub API request limit.</p>"
                else:
                    yield "<p>Something went wrong: " + str(e) + "</p>"
                gotten = False

            if gotten:
                if data:
                    final_data = "<p> The user %s has these public repos: <br /> <ul>" % name
                    for i in data:
                        final_data += "<li>" + i['name'] + "</li>"
                    final_data += "</ul></p>"
                    yield final_data
                else:
                    yield "<p>This user doesn't currently have any repos.</p>"
Example #3
	def post(self):
		title = strip_markup(self.get_argument("title"))
		tag = strip_markup(self.get_argument("tag"))
		length = strip_markup(self.get_argument("length"))
		dmy = strip_markup(self.get_argument("start_date"))
		hour = strip_markup(self.get_argument("hour"))
		ssub = strip_markup(self.get_argument("short_description"))
		lsub = strip_markup(self.get_argument("long_description"))
		user = self.current_user
		ch_id = uuid.uuid4()
		url = uuid_to_url(ch_id)

		length = int(length)
		hour = ''.join([dmy, " ", hour])
		dmy = ''.join([dmy, " ", "00:00"])
		datetime_dmy = datetime.datetime.strptime(dmy, '%Y-%m-%d %H:%M')
		datetime_start = datetime.datetime.strptime(hour, '%Y-%m-%d %H:%M')

		utc = pytz.timezone('UTC')
		datetime_dmy = utc.localize(datetime_dmy)
		datetime_start = utc.localize(datetime_start)

		# timestamp will just use cassandra getdate(now())
		# Need to insert into all channel column families; see db_schema for columns
		yield gen.Task(create_channel.apply_async, args=[title, tag, length, datetime_dmy, datetime_start, user, ch_id, url, ssub, lsub])

		self.redirect("/ch/%s" % url)
Example #4
def phone_validator(node, value):
    """ checks to make sure that the value looks like a phone number """
    value = htmllaundry.strip_markup(value)
    allowed = set(string.ascii_lowercase + string.digits + ' ' + '.' + '+' +
                  '(' + ')' + '-')
    tval = set(value) <= allowed

    if value == '':
        raise colander.Invalid(node, 'Please provide a valid phone number')
    if not tval:
        raise colander.Invalid(
            node, '%s is not a valid telephone number format' % value)
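A minimal, hypothetical sketch of how this validator might be attached to a colander schema; the schema and field names are illustrative.

    import colander

    class ContactSchema(colander.Schema):
        # 'phone' is an illustrative field name; phone_validator is the function above
        phone = colander.SchemaNode(colander.String(), validator=phone_validator)

    # ContactSchema().deserialize({'phone': '+1 (555) 123-4567'}) passes,
    # while a value containing disallowed characters raises colander.Invalid.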
Example #5
def phone_validator(node, value):
    """ checks to make sure that the value looks like a phone number """
    value = htmllaundry.strip_markup(value)
    allowed = set(string.ascii_lowercase + string.digits + ' ' + '.' + '+' + '(' + ')' + '-')
    tval = set(value) <= allowed

    if value == '':
        raise colander.Invalid(node,
               'Please provide a valid phone number')
    if not tval:
        raise colander.Invalid(node,
               '%s is not a valid telephone number format' % value)
Example #6
def html_cleaner(html):
    # Parse the joined HTML fragments (an explicit parser avoids BeautifulSoup's warning)
    soup = BeautifulSoup('\n'.join(html), 'html.parser')

    # Remove 'script', 'style' and 'option' tags entirely
    for tag_name in ('script', 'style', 'option'):
        for s in soup(tag_name):
            s.extract()

    cleaned_sents = htmllaundry.strip_markup(str(soup))  # leave only text

    # Collapse runs of empty lines, then collapse runs of whitespace
    # (note: flags must be passed as a keyword; the fourth positional argument of re.sub is count)
    cleaned_sents = re.sub(r'\n\s*\n+', '\n\n', cleaned_sents).strip()
    cleaned_sents = re.sub(r'\s+', ' ', cleaned_sents, flags=re.M).strip()

    return cleaned_sents
Example #7
def html_cleaner(html):
    # Parse the joined HTML fragments (an explicit parser avoids BeautifulSoup's warning)
    soup = BeautifulSoup('\n'.join(html), 'html.parser')

    # Remove 'script', 'style' and 'option' tags entirely
    for tag_name in ('script', 'style', 'option'):
        for s in soup(tag_name):
            s.extract()

    cleaned_sents = htmllaundry.strip_markup(str(soup))  # leave only text

    # Collapse runs of empty lines, then collapse runs of whitespace
    # (note: flags must be passed as a keyword; the fourth positional argument of re.sub is count)
    cleaned_sents = re.sub(r'\n\s*\n+', '\n\n', cleaned_sents).strip()
    cleaned_sents = re.sub(r'\s+', ' ', cleaned_sents, flags=re.M).strip()

    return cleaned_sents
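A small, hypothetical usage sketch for the cleaner above; the input is assumed to be an iterable of HTML strings, since it is joined with newlines before parsing.

    fragments = [
        "<p>First   paragraph.</p>",
        "<script>ignored();</script>",
        "<p>Second paragraph.</p>",
    ]
    # Prints roughly "First paragraph. Second paragraph." (scripts removed, whitespace collapsed)
    print(html_cleaner(fragments))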
Example #8
 def cleanup(self):
     a = self.transformable
     a['title'] = html.unescape(
         a['title']
     ).replace('\'\'', '\'')
     try:
         a['intro'] = strip_markup(
             html.unescape(a['intro'])
         )
     except Exception:
         pass
     # Convert the comma-separated author ids to integers (the original loop only
     # rebound its loop variable, so the conversion never took effect)
     a['authors'] = [int(idx) for idx in a['authors'].split(',')]
     try:
         a['dossier'] = int(float(a['dossier']))
     except (TypeError, ValueError):
         a['dossier'] = None
     return a
Example #9
def preProcessingData(file_name):
    data_frame = dp.read_csv(file_name)

    # Data Tokenization
    list_tokenization = [
        strip_markup(clean_text).split(" ")
        for clean_text in data_frame['Body']
    ]

    # Lower-case conversion and stop-word removal
    # (build the stop-word set once instead of re-reading it for every token)
    english_stopwords = set(stopwords.words('english'))
    data_tokenization_lower = []
    for tokenization in list_tokenization:
        tokenization_lower = []
        for x in tokenization:
            x_lower = x.lower().replace('\n', ' ')
            if x_lower not in english_stopwords:
                tokenization_lower.append(x_lower)

        data_tokenization_lower.append(tokenization_lower)

    data_frame['Body'] = data_tokenization_lower
    return data_frame
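The example above assumes `dp` is an alias for pandas (e.g. `import pandas as dp`) and that the NLTK stop-word corpus is already available; a one-time setup sketch:

    import nltk

    # Download the stop-word corpus once so stopwords.words('english') works afterwards
    nltk.download('stopwords')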
Example #10
    async def mal_cmd(self, ctx, type, *, name):
        if type.lower() == "anime":
            query = '''
            query ($id: Int, $page: Int, $perPage: Int, $search: String) {
                Page (page: $page, perPage: $perPage) {
                    pageInfo {
                        total
                        currentPage
                        lastPage
                        hasNextPage
                        perPage
                    }
                    media (id: $id, search: $search, type: ANIME) {
                        id
                        title {
                        romaji
                        english
                        native
                        }
                        status
                        startDate{
                        year
                        month
                        day
                        }
                        episodes
                        format
                        coverImage{
                        large
                        }
                        bannerImage
                        siteUrl
                        source
                        type
                        averageScore
                        meanScore
                        description
                    }
                }
            }
            '''
            variables = {
                'search': name,
                'page': 1,
                'perPage': 3
            }
            url = 'https://graphql.anilist.co'

            async with aiohttp.ClientSession() as session:
                async with session.post(url,json={'query':query,'variables':variables}) as response:
                    responsed=await response.json()
                    page=responsed['data']['Page']
                    media=page['media']
                    result=media[0]
                    embed_obj=discord.Embed(colour=discord.Colour.red(),url=result['siteUrl'])
                    embed_obj.set_thumbnail(url=result['coverImage']['large'])
                    embed_obj.set_image(url=result['bannerImage'])
                    # AniList returns null (None) when there is no English title;
                    # fall back to the romaji title in that case
                    if result['title']['english'] is None:
                        embed_obj.title = f"{result['title']['romaji']} ({result['title']['native']})"
                    else:
                        embed_obj.title = f"{result['title']['english']} ({result['title']['native']})"
                    embed_obj.add_field(name="Status:",value=result['status'])
                    embed_obj.add_field(name='Started at:',value=f"{result['startDate']['year']}/{result['startDate']['month']}/{result['startDate']['day']}")
                    embed_obj.add_field(name="Episodes:",value=result['episodes'])
                    embed_obj.add_field(name="Type:",value=result['type'])
                    embed_obj.add_field(name="Average score:",value=result['averageScore'])
                    embed_obj.add_field(name="Mean score:",value=result['meanScore'])
                    # AniList may return a null description; guard before measuring its length
                    description = strip_markup(result['description']) if result['description'] else ''
                    if len(description) <= 1024:
                        embed_obj.add_field(name='Synopsis:', value=description, inline=True)
                        await ctx.send(embed=embed_obj)
                    else:
                        # Too long for an embed field: ask first, and only send it if the user says yes
                        await ctx.send(embed=embed_obj)
                        await ctx.send("The synopsis is longer than 1024 characters. "
                                       "Do you want it sent here?\n*Yes/No*\n*Timeout is 20 seconds.*")

                        def check(message):
                            return (message.author == ctx.author
                                    and message.content.lower() in ("yes", "y", "no", "n"))

                        try:
                            # requires "import asyncio" at module level
                            reply = await self.client.wait_for('message', timeout=20, check=check)
                        except asyncio.TimeoutError:
                            return
                        if reply.content.lower() in ("yes", "y"):
                            await ctx.send(description)

        if type.lower() == "manga":
            query = '''
            query ($id: Int, $page: Int, $perPage: Int, $search: String) {
                Page (page: $page, perPage: $perPage) {
                    pageInfo {
                        total
                        currentPage
                        lastPage
                        hasNextPage
                        perPage
                    }
                    media (id: $id, search: $search, type: MANGA) {
                        id
                        title {
                        romaji
                        english
                        native
                        }
                        status
                        startDate{
                        year
                        month
                        day
                        }
                        episodes
                        format
                        coverImage{
                        large
                        }
                        bannerImage
                        siteUrl
                        source
                        type
                        averageScore
                        meanScore
                        description
                    }
                }
            }
            '''
            variables = {
                'search': name,
                'page': 1,
                'perPage': 3
            }
            url = 'https://graphql.anilist.co'

            async with aiohttp.ClientSession() as session:
                async with session.post(url,json={'query':query,'variables':variables}) as response:
                    responsed=await response.json()
                    page=responsed['data']['Page']
                    media=page['media']
                    result=media[0]
                    embed_obj=discord.Embed(colour=discord.Colour.red(),url=result['siteUrl'])
                    embed_obj.set_thumbnail(url=result['coverImage']['large'])
                    # AniList returns null (None) when there is no English title;
                    # fall back to the romaji title in that case
                    if result['title']['english'] is None:
                        embed_obj.title = f"{result['title']['romaji']} ({result['title']['native']})"
                    else:
                        embed_obj.title = f"{result['title']['english']} ({result['title']['native']})"
                    embed_obj.add_field(name="Status:",value=result['status'])
                    embed_obj.add_field(name='Started at:',value=f"{result['startDate']['year']}/{result['startDate']['month']}/{result['startDate']['day']}")
                    embed_obj.add_field(name="Episodes:",value=result['episodes'])
                    embed_obj.add_field(name="Type:",value=result['type'])
                    embed_obj.add_field(name="Average score:",value=result['averageScore'])
                    embed_obj.add_field(name="Mean score:",value=result['meanScore'])
                    # AniList may return a null description; guard before measuring its length
                    description = strip_markup(result['description']) if result['description'] else ''
                    if len(description) <= 1024:
                        embed_obj.add_field(name='Synopsis:', value=description, inline=True)
                        await ctx.send(embed=embed_obj)
                    else:
                        # Too long for an embed field: ask first, and only send it if the user says yes
                        await ctx.send(embed=embed_obj)
                        await ctx.send("The synopsis is longer than 1024 characters. "
                                       "Do you want it sent here?\n*Yes/No*\n*Timeout is 20 seconds.*")

                        def check(message):
                            return (message.author == ctx.author
                                    and message.content.lower() in ("yes", "y", "no", "n"))

                        try:
                            # requires "import asyncio" at module level
                            reply = await self.client.wait_for('message', timeout=20, check=check)
                        except asyncio.TimeoutError:
                            return
                        if reply.content.lower() in ("yes", "y"):
                            await ctx.send(description)
    def clean_my_file(self, x):

        #  preprocess the text
        # print x

        # get rid of newlines, tabs and carriage returns.
        x = re.sub('\r', '', x)
        x = re.sub('\t', '', x)
        x = re.sub('\n', '', x)

        # Some of the blog posts contain HTML elements in their undecoded form and
        # some don't, so we want to make sure we get rid of all HTML code. That is
        # why we decode the most common HTML character entities.

        # replace all linked content with [URL]
        # we will use the linked content in one of our features.
        x = re.sub('<[aA] (href|HREF)=.*?</[aA]>;?', ' URL ',
                   x)  # replace urls
        x = re.sub('<img.*?>;?', ' URL ', x)  # replace urls
        x = re.sub('(http|https|ftp)://?[0-9a-zA-Z\.\/\-\_\?\:\=]*', ' URL ',
                   x)
        x = re.sub('(^|\s)www\..+?(\s|$)', ' URL ', x)

        x = re.sub('(^|\s)(http|https|ftp)\:\/\/t\.co\/.+?(\s|$)', ' URL ', x)
        x = re.sub('(^|\s)(http|https|ftp)\:\/\/.+?(\s|$)', ' URL ', x)
        x = re.sub('(^|\s)pic.twitter.com/.+?(\s|$)', ' URL ', x)

        # clean all the HTML markups, this function is a part of htmllaundry
        x = strip_markup(x)

        # get rid of bbcode formatting and remaining html markups
        x = re.sub('[\[\<]\/?b[\]\>];?', '', x)
        x = re.sub('[\[\<]\/?i[\]\>];?', '', x)
        x = re.sub('[\[\<]br [\]\>];?', '', x)
        x = re.sub('/>', '', x)
        x = re.sub('[\<\[]\/?h[1-4][\>\]]\;?', '', x)
        x = re.sub('\[\/?img\]', '', x)
        x = re.sub('\[\/?url\=?\]?', '', x)
        x = re.sub('\[/?nickname\]', '', x)
        # x = re.sub(';{1,}',' ', x)

        # get rid of whitespaces
        x = re.sub(' {1,}', ' ', x)
        x = self.h.unescape(x)

        # delete everything else that strip_markup doesn't
        x = re.sub('height=".*?"', '', x)
        x = re.sub('width=".*?"', '', x)
        x = re.sub('alt=".*?"', '', x)
        x = re.sub('title=".*?"', '', x)
        x = re.sub('border=".*?"', '', x)
        x = re.sub('align=".*?', '', x)
        x = re.sub('style=".*?"', '', x)
        x = re.sub(' otted  border-color:.*?"', '', x)
        x = re.sub(' ashed  border-color:.*?"', '', x)
        x = re.sub('target="_blank">', '', x)
        x = re.sub('<a target=" _new"  href="  ]', '', x)
        x = re.sub('<a target="_new" rel="nofollow" href=" ]', '', x)

        # users for tweeter
        x = re.sub('(^|\s)@(?!\s).+?(?=(\s|$))', ' USER ', x)
        x = x.strip()

        # print x
        return x
Example #12
def remove_html_tags(word):
    if word is None:
        return ""

    return strip_markup(word)
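An illustrative call (output shown approximately, since htmllaundry may keep some surrounding whitespace):

    print(remove_html_tags("<p>Hello <b>world</b></p>").strip())  # -> Hello world
    print(remove_html_tags(None))                                 # -> "" (guarded above)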
Example #13
 def pre_validate(self, form):
     self.data = htmllaundry.strip_markup(self.data)
Example #14
def Getterms(content, lang, prods, returnJSON):
	global __debug_on__, bad_stemmer_1, bad_stemmer_3, bad_stemmer_4
	bad_stemmer_1 = bad_stemmer_3 = bad_stemmer_4 = 0

	Service.logger.debug("Started processing for %d segments for language %s" % (len(content), lang))

	
	new_content = set()
	new_content_orig = ""
	new_content_orig_tok = set()
	
	for seg in set(content):
		# Mask { and } as they clash with the inner workings of the chunker
		seg = seg.replace(u"{",u"﹛").replace(u"}",u"﹜")

		# Treat UI strings containing \r and \n escapes, collapse new lines,
		# and clean up the line endings (not sure if this is useful at all)
		seg = seg.replace('\\r','\r').replace('\\n','\n').replace('\r\n','\n').replace('\n ','\n')
		new_content_orig += " " + seg
		for word in word_tok.tokenize(seg):
			new_content_orig_tok.add(word)

		seg = seg.replace('%','').replace(".. ."," ...").replace('<openparen>','(').replace('<closeparen>',')').replace("&apos;","'").replace('&quot;', '"').replace('&amp;', '&').replace('&lt;', '<').replace('&gt;', '>')

	# Do the following even occur in our data?
	#	new_content = new_content.replace('&circ;', '^')
	#	new_content = new_content.replace('&tilde;', '~')
	#	new_content = new_content.replace('&ndash;', '–')
	#	new_content = new_content.replace('&mdash;', '—')
	#	new_content = new_content.replace('&lsquo;', '‘')
	#	new_content = new_content.replace('&rsquo;', '’')
	#	new_content = new_content.replace('&sbquo;', ',')
	#	new_content = new_content.replace('&ldquo;', '“')
	#	new_content = new_content.replace('&rdquo;', '”')
	#	new_content = new_content.replace('&bdquo;', '"')
	#	new_content = new_content.replace('&permil;', '‰')
	#	new_content = new_content.replace('&euro;', '€')
	
		# Strip HTML/XML markup
		seg = strip_markup(seg).replace('\\t','\n')

		# Some very crude pre-tokenisation
		seg = seg.replace(':',' :').replace('\t','\n').replace("\\", '\n').replace('|','\n')


		# It’s not quite clear what this is supposed to do; note that str.replace()
		# does not interpret regular expressions, so the "[\w]+_" patterns below are
		# matched literally and are almost certainly no-ops.
		seg = seg.replace("&","").replace("[\\w]+_[\\w]+","\n").replace("[\\w]+_","\n").replace("_[\\w]+","\n")
		
#		Service.logger.debug("Segment: " + seg)
		for seg in sent_tokenizer.tokenize(seg):
			# Unmask { and }
			seg = seg.replace(u"﹛",u"{").replace(u"﹜",u"}")
			new_content.add(seg)
	

	if __debug_on__:
#		Service.logger.debug("Finished sentence segmentation.")
		Service.logger.debug("Finished character-level pre-processing.")

	# word tokenize the sentences
	new_content_tokenized = [word_tok.tokenize(line) for line in new_content]

	if __debug_on__:
		Service.logger.debug("Finished main tokenisation.")

	# Remove empty lines.
	# Remove one-word lines which are all caps. Typically: commands.
	# (Build a new list instead of removing items from the list being iterated,
	# which would silently skip elements.)
	filtered_tokenized = []
	for l in new_content_tokenized:
		if len(l) == 0 or (len(l) == 1 and l[0].isupper()):
			continue
	# Remove placeholders (note: any modification can be made to the text here, as the final output will be verified in the original content)
	# (this doesn’t really make any sense to me —— V.)
		filtered_tokenized.append([t for t in l if '%' not in t])
	new_content_tokenized = filtered_tokenized
   
	# POS tag sentences
	tagged_sent = pos_tagger.tag_sents(new_content_tokenized)

	if __debug_on__:
		Service.logger.debug("Finished main POS tagging.")


	# [Issue not repro since default tagger is added] I leave it in, in any case. None tags are extremely rare, deleting these segments results in minimal loss. 
	tagged_sent = [r for r in tagged_sent if ("', None), ('" not in str(r)) and ("', None" not in str(r)) and ("', ''" not in str(r))]


	# Define chunkers (left in 'Unk'/'UNK' as POS-tags for unknown words in the chunker definition, but the tagger uses 'NN'.) 
	def GetSurfaceChunksByStem(sentences):
		global bad_stemmer_1
		grammar = (r'''CHUNK: {(<Unk|UNK|NN.*|VBN>*)(<JJ.*|VBN>*)(<Unk|UNK|NN.*|VBN>)(<Unk|UNK|NN.*>+)}''')
		cp = nltk.RegexpParser(grammar)
		chunks = set()
		for sent in sentences:
			try:
				tree = cp.parse(sent)
				for subtree in tree.subtrees():
					if subtree.label() == 'CHUNK':
						chunks.add(' '.join([l[0] for l in subtree.leaves()]))
			except:
				bad_stemmer_1 += 1
				Service.logger.debug("Issues with chunker 1!".encode('utf-8'))
				Service.logger.debug(Service.traceback.format_exc())
				# Service.logger.debug("1+")
				# Service.logger.debug("Bad stemmer 1: "+str(sent))
		return chunks

	# This chunker extracts units like "Elements limiting slenderness"
	def GetSurfaceChunksByStem3(sentences):
		global bad_stemmer_3
		grammar = (r'''CHUNK: {<Unk|UNK|NN.*|VBN> <VBG> <Unk|UNK|NN.*>}''')
		cp = nltk.RegexpParser(grammar)
		chunks = set()
		for sent in sentences:
			try:
				tree = cp.parse(sent)
				for subtree in tree.subtrees():
					if subtree.label() == 'CHUNK':
						chunks.add(' '.join([l[0] for l in subtree.leaves()]))
			except:
				bad_stemmer_3 += 1
				Service.logger.debug("Issues with chunker 3!".encode('utf-8'))
				Service.logger.debug(Service.traceback.format_exc())
				# Service.logger.debug("3+")
		return chunks

	# Chunker for single word noun-like units 
	def GetNouns(sentences):
		global bad_stemmer_4
		grammar = (r'''CHUNK: {<Unk|UNK|NN.*|VBN|JJ.*>}''')
		cp = nltk.RegexpParser(grammar)
		chunks = set()
		for sent in sentences:
			try:
				tree = cp.parse(sent)
				for subtree in tree.subtrees():
					if subtree.label() == 'CHUNK':
						chunks.add(' '.join([l[0] for l in subtree.leaves()]))
			except:
				bad_stemmer_4 += 1
				Service.logger.debug("Issues with chunker 4!".encode('utf-8'))
				Service.logger.debug(Service.traceback.format_exc())
				# Service.logger.debug("4+")
		return chunks
	
	# Get compound chunks extracted   
	# AND Remove duplicate chunks
	new_chunks = GetSurfaceChunksByStem(tagged_sent).union(GetSurfaceChunksByStem3(tagged_sent))

	if __debug_on__:
		Service.logger.debug("Finished main chunking.")
		Service.logger.debug((u"Skipped bad parses as follows: 1=" + str(bad_stemmer_1) + u" 3=" + str(bad_stemmer_3) + u" 4=" + str(bad_stemmer_4)).encode('utf-8'))



	# Correct chunks (some corrections aren't reproducible because they were added for a
	# different tokenizer; they don't hurt to have, so I leave them in).

	#  Maybe these characters should be removed from the beginning...?
	not_needed = ['.', '^', "'", "\\", "/", "!", '_', '%', "=", '*', '>', '<', '\\', ":", "|"]

	new_compounds = set()
	for w in new_chunks:
	# [Issue not repro.] Remove '@' from multi-word units.	
		w = w.replace('@', '')
	# [Issue not repro.] Remove '*' from the multi-word units.
		w = w.replace('*', '')
	# [Issue not repro.] Remove '.' from the end of multi-word units.	
	# [Issue not repro.] Remove ',' from the end of multi-word units.	
		w = w.rstrip(".,")
	# Correct issue deriving from tokenization	
		w = w.replace(" 's", "'s")
	# Get rid of words containing '+' in chunks (for sw strings).
	# Eg: 'Ctrl+A key combination' will become 'key combination'
		if '+' in w:
			tok = word_tok.tokenize(w)
			for i in tok:
				if  "+" in i:
					w = w.replace(i, '')
	# [Issue not repro.] Remove '=' from multi-word units.
		w = w.replace('=', '')
	# [Issue not repro.] Remove double spaces from multi-word units.
		w = w.replace('  ', ' ')
	# [Issue not repro.] Remove space from the end of multi-word units.
	# [Issue not repro.] Remove space from the beginning of multi-word units.
		w = w.strip()
	
	# remove one-letter words from the chunk units (e.g. remains of placeholders);
	# switched to removing the whole chunk if a one-letter word was found
		noWordFound = False
		for word in word_tok.tokenize(w):
			if word in nowords or len(word) == 1:
				noWordFound = True
				break
		if not noWordFound:
			for mark in not_needed:
				if mark in w:
					noWordFound = True
					break
			if not noWordFound:
				new_compounds.add(w)
  

	if __debug_on__:
		Service.logger.debug("Finished first chunk cleanup.")

	
	# extract noun(like) units
	nouns = [w for w in GetNouns(tagged_sent) if not w.isdigit()]


	# clean results up from (untranslatable) characters, nowords content and check if they are in the original content as-is
	new_nouns = set()
	for n in nouns:
		not_needed_found = False
		for mark in not_needed:
			if mark in n:
				not_needed_found = True
				break
		if not not_needed_found:
			new_nouns.add(n)
	
	new_nouns = set([w.lower() for w in new_nouns if (w.lower() not in nowords) and (w in new_content_orig_tok)])

	if __debug_on__:
		Service.logger.debug("Finished noun selection.")

	   
	
	# Compounds: new_compounds
	# Single words: new_nouns
	# Create one group of all chunks
	# check back if extracted term candidates are in the original text as well
	new_words_and_compounds = [w for w in new_compounds.union(new_nouns) if w in new_content_orig]

	
	if __debug_on__:
		Service.logger.debug("Starting substring cleanup for " + str(len(new_words_and_compounds)) + " chunks")
	# remove multi-word chunks that are compounds of smaller multi-word chunks. For example, 'calculation configuration' and
	# 'dialog box' remain, but 'calculation configuration dialog box' will be removed
	tempSet = set()
	new_chunks_set = set([_.lower() for _ in new_words_and_compounds])
	new_chunks_temp = sorted(new_chunks_set, key=cmp_to_key(locale.strcoll))
	counter = 0
	for i in range(0, len(new_chunks_temp)):
		for j in range(i, len(new_chunks_temp)):
			if __debug_on__:
				counter += 1
				if not counter % 10000:
					Service.logger.debug(".")
				if not counter % 500000:
					Service.logger.debug(str(counter))
			nc = new_chunks_temp[i] + ' ' + new_chunks_temp[j]
			if nc in new_chunks_set:
				# Service.logger.debug("found superstring " + nc)
				tempSet.add(nc)
			nc = new_chunks_temp[j] + ' ' + new_chunks_temp[i]
			if nc in new_chunks_set:
				# Service.logger.debug("found superstring " + nc)
				tempSet.add(nc)
	# Word tokenize filtered multi-word units.
	new_words_and_compounds = [w for w in new_chunks_temp if w not in tempSet]

	if __debug_on__:
		Service.logger.debug("Finished chunk substring cleanup.")
		

	# Query NeXLT for existing translation
	if __debug_on__:
		Service.logger.debug("Running NeXLT queries for " + str(len(new_words_and_compounds)) + " chunks...")
	
	def QueryNeXLT(term, language, prod_name):
		r = None
		if Service.isStaging:
			r = requests.get("http://aws.stg.solr:8983/search/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:"  + prod_name +  "%20AND%20" + language + ":['' TO *]")
		else:
			r = requests.get("http://aws.prd.solr:8983/search/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:"  + prod_name +  "%20AND%20" + language + ":['' TO *]")
		r.encoding = "utf-8"
		try:
			response = r.json()['response']['numFound']
		except:
			response = 0
		return response
					   
	
	def QueryNeXLTAllProds(term, language):
		r = None
		if Service.isStaging:
			r = requests.get("http://aws.stg.solr:8983/search/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:"  + '*' +  "%20AND%20" + language + ":['' TO *]")
		else:
			r = requests.get("http://aws.prd.solr:8983/search/select/?wt=json&start=0&rows=1&q=enu%3A%22" + term + "%22%20AND%20product:"  + '*' +  "%20AND%20" + language + ":['' TO *]")
		r.encoding = "utf-8"
		try:
			response = r.json()['response']['numFound']
		except:
			response = 0
		return response


	new_words_and_compounds_in_product = []

	for t in new_words_and_compounds:
		newTerm = True
		for prod_name in prods:
			newTerm = newTerm and QueryNeXLT(t.lower(), lang, prod_name) == 0
		if newTerm:
			new_words_and_compounds_in_product.append(t)

   
	# product independent query to NeXLT
	# append context and product/corpus information + number of occurrences
	terms = []

	for term in new_words_and_compounds_in_product:
		# Unmask { and }
		term = term.replace(u"﹛",u"{").replace(u"﹜",u"}")
		contexts = [con for con in new_content if term.lower() in con.lower()]
		# Skip any terms that cannot be found in the original source. We cannot provide terms with no context to translators.
		if len(contexts) == 0:
			Service.logger.error(u"Could not find original context for term %s!" % term)
			continue
		if QueryNeXLTAllProds(term.lower(), lang) == 0:
			terms.append([term, "Corpus", contexts, len(contexts), len(term)])
		else:
			terms.append([term, "Product", contexts, len(contexts), len(term)])
				


	if __debug_on__:
		Service.logger.debug("Finished NeXLT calls with %s new terms remaining." % len(terms))

		 
	if returnJSON:
		# Sort final term list and create json format
		terms = sorted(terms, key=itemgetter(1,0), reverse=True)
	
		terms_for_json = {}

		for listitem in terms:
			if prods[0] == "NEW_PRODUCT":
				k = {listitem[0]: {'newto': "New product, search in corpus only", 'context':  listitem[2], 'numContextSents': listitem[3]}}
			else:
				k = {listitem[0]: {'newto': listitem[1], 'context':  listitem[2], 'numContextSents': listitem[3]}}
			terms_for_json.update(k)

  
		if __debug_on__:
			Service.logger.debug("Finished final processing.")

		  
		return terms_for_json
		
	else:
		if __debug_on__:
			Service.logger.debug("Finished final processing.")
			Service.logger.debug("Extracted terms:")
#			import pprint
#			for term in terms:
#				Service.logger.debug("\tterm: %s" % pprint.saferepr(term))

		return terms
Example #15
def remove_html_tags(word):
    if word is None:
        return ""

    return strip_markup(word)
Example #16
 def pre_validate(self, form):
     if self.data:
         self.data = htmllaundry.strip_markup(self.data)
     else:
         self.data = None
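For context, a hypothetical sketch of the WTForms field this hook would typically live on; the class and field names are illustrative, and only the `pre_validate` body comes from the example above.

import htmllaundry
from wtforms import Form, StringField

class SanitizedStringField(StringField):
    # Strip HTML markup before the field's validators run
    def pre_validate(self, form):
        if self.data:
            self.data = htmllaundry.strip_markup(self.data)
        else:
            self.data = None

class CommentForm(Form):
    body = SanitizedStringField('Body')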