Exemplo n.º 1
0
def add_lang(item):
    """Attach a guessed language code to *item* under 'lang'.

    Tries the institution description first, then the job-functions list.
    Returns the enriched item, or {} when no language can be detected.
    """
    guessed = guess_language.guessLanguage(item['institution_description'])
    if guessed == 'UNKNOWN':
        guessed = guess_language.guessLanguage(item['job_functions_list'])

    if guessed != 'UNKNOWN':
        item['lang'] = guessed
        return item

    logging.info('No language found for item %s' % item['phid'])
    return {}
Exemplo n.º 2
0
def index(page = 1):
	"""Home page: list followed posts and accept a new-post submission.

	On a valid POST, stores the post (with guessed language) and
	redirects so a browser refresh cannot double-submit.
	"""
	form = PostForm()

	if form.validate_on_submit():

		language = guessLanguage(form.post.data)
		# BUG FIX: the original tested the misspelled name `lanaguage`,
		# which raised NameError whenever a language was detected.
		if language == 'UNKNOWN' or len(language) > 5:
			language = ''

		post = Post(body = form.post.data, 
			timestamp = datetime.utcnow(), 
			author = g.user,
			language = language)

		db.session.add(post)
		db.session.commit()

		flash(gettext('Your post is now live!'))
		return redirect(url_for('index'))


	# paginate(page number, items per page, error flag)
	posts = g.user.followed_posts().paginate(page, POST_PER_PAGE, False)

	return render_template("index.html", 
		title = "Home",
		form  = form, 
		posts = posts)
Exemplo n.º 3
0
def post_prayer():
    """Create a new prayer post; attach it to selected groups when private."""
    form = PostForm()
    group_forms = [(group, GroupPost(prefix = str(group.id))) for group in g.user.groups]
    if form.validate_on_submit():
        language = guessLanguage(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        post = Post(subject = form.subject.data,
            body = form.post.data,
            timestamp = datetime.utcnow(),
            author = g.user,
            language = language,
            public = form.public.data)
        db.session.add(post)
        db.session.commit()
        if not post.public:
            # Attach the post only to groups whose checkbox was ticked.
            # FIX: the original filter lambda named its parameter `g`,
            # shadowing Flask's request-global `g` inside the lambda.
            selected = [pair for pair in group_forms if pair[1].group_access.data]
            for grp, _grp_form in selected:
                grp.add_post(post)
                db.session.add(grp)
            db.session.commit()   
        flash(gettext('Your post is now live!'))
        return redirect(url_for('post', id = post.id))
    return render_template('post_form.html',
        title = 'Post Prayer',
        form = form,
        group_forms = group_forms)
Exemplo n.º 4
0
def edit_post(id, page = 1):
    """Edit an existing post owned by the current user.

    A GET pre-fills the form from the stored post; a valid POST saves
    the new subject/body (with re-guessed language) and redirects home.
    Non-owners and unknown ids are bounced back to the index.
    """
    post = Post.query.get(id)
    if post is None:  # `is None` instead of `== None` (identity check)
        flash('Post not found.')
        return redirect(url_for('index'))
    if post.author.id != g.user.id:
        flash('You cannot edit this post.')
        return redirect(url_for('index'))
    form = PostForm()
    if form.validate_on_submit():
        language = guessLanguage(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        post.subject = form.subject.data
        post.body = form.post.data
        post.language = language
        db.session.add(post)
        db.session.commit()
        flash(gettext('Your post has been updated!'))
        return redirect(url_for('index'))
    elif request.method != "POST":
        # Initial GET: seed the form with the current post contents.
        form.subject.data = post.subject
        form.post.data = post.body
    comments = post.comments.order_by(Comment.timestamp.desc()).paginate(page, POSTS_PER_PAGE, False)
    return render_template('edit_post.html',
        post = post,
        form = form,
        comments = comments)
Exemplo n.º 5
0
def pack(args):
    # Build an NDEF message from args.input and write it to args.outfile.
    # text/plain content becomes a Text Record (urn:nfc:wkt:T) with a
    # guessed language; anything else becomes a generic mime-type record.
    if args.type == 'unknown':
        print >> sys.stderr, "guess mime type from file"
        mimetype = mimetypes.guess_type(args.input.name, strict=False)[0]
        if mimetype is not None: args.type = mimetype
    if args.name is None:
        # Default the record name to the input filename, except for stdin.
        args.name = args.input.name if args.input.name != "<stdin>" else ""
        
    data = args.input.read()

    if args.type == "text/plain":
        print >> sys.stderr, "text/plain ==> urn:nfc:wkt:T"
        try:
            # guess-language is an optional dependency; fall back to English.
            from guess_language import guessLanguage
            print >> sys.stderr, "guess language from text"
            language = guessLanguage(data)
            if language == "UNKNOWN": language = "en"
        except ImportError:
            language = "en"
        print >> sys.stderr, "text language is '%s'" % language
        record = nfc.ndef.TextRecord(data, language=language)
        record.name = args.name
    else:
        print >> sys.stderr, "mime type is %s" % args.type
        record = nfc.ndef.Record(args.type, args.name, data)

    message = nfc.ndef.Message(record)
    # stdout gets a hex dump; a real file gets the raw message bytes.
    if args.outfile.name == "<stdout>":
        args.outfile.write(str(message).encode("hex"))
    else:
        args.outfile.write(str(message))
Exemplo n.º 6
0
def index(page=1):
    """Home page: show followed posts and accept new post submissions."""
    form = PostForm()
    if form.validate_on_submit():
        detected = guessLanguage(form.post.data)
        if detected == 'UNKNOWN' or len(detected) > 5:
            detected = ''  # empty string signals unknown language
        new_post = Post(body = form.post.data,
                    timestamp = datetime.utcnow(),
                    author=g.user,
                    language = detected)
        db.session.add(new_post)
        db.session.commit()
        flash("Your post is now live!")
        # POST/redirect/GET: a refresh re-issues a GET instead of
        # resubmitting the form and creating a duplicate post.
        return redirect(url_for('index'))

    # paginate(page number, items per page, error flag); the pagination
    # object's .items attribute is the list of posts for this page.
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)

    return render_template('index.html',
                           title="Home",
                           form=form,
                           posts=posts)
Exemplo n.º 7
0
def index(page = 1):
    """Home page: create a post on valid submit, else list followed posts."""
    form = PostForm()
    if form.validate_on_submit():
        detected = guessLanguage(form.post.data)
        if detected == 'UNKNOWN' or len(detected) > 5:
            detected = ''
        new_post = Post(body = form.post.data, timestamp = datetime.utcnow(),
                author= g.user, language = detected)
        db.session.add(new_post)
        db.session.commit()
        flash('Your post is now live!')
        # Redirect so a browser refresh cannot double-post.
        return redirect(url_for('index'))

    # Paginated query over the posts of followed users.
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
    return render_template('index.html', 
            title = 'Home', 
            form = form,
            posts=posts)
def event_analysis_fulfill_corpus(event_analysis, websites, description_tree_tagger, website_tree_tagger, events):
    """
    Part 1 of the event analysis, that fulfill the corpus

    For each event, adds its description (when in the analysis language)
    to the corpus and tags it with TreeTagger; when the description is too
    short, crawls the event website and adds/tags its text as well.
    """
    tagger = TreeTagger()
    # We complete the corpus with plain text of description & website if exists
    for e in events:
        len_description = 0
        if e.description != '' and guess_language.guessLanguage(e.description.encode('utf-8')) == LANGUAGE_FOR_TEXT_ANALYSIS:
            event_analysis.add_document_in_corpus(e.description, EventAnalysis.get_id_website(e.id, False))
            description_tree_tagger[e.id] = tagger.tag_text(e.description, FILTER_TREE_TAGGER)
            len_description = len(description_tree_tagger[e.id])

        # NOTE(review): comparing len_description against
        # is_nb_word_website_enough(len_description) looks suspicious
        # (same value in and out) — confirm the helper's contract.
        if e.website != '' and len_description < is_nb_word_website_enough(len_description):
            try:
                unique_urls = HashTableUrl()
                TreeNode(e.website.encode('utf-8'), DEFAULT_RECURSION_WEBSITE, unique_urls)
                websites[e.website] = ''
                for w in unique_urls.get_urls():
                    websites[e.website] += event_website_parser(w) + ' '

                event_analysis.add_document_in_corpus(websites[e.website], EventAnalysis.get_id_website(e.id, True))
                website_tree_tagger[e.id] = tagger.tag_text(websites[e.website], FILTER_TREE_TAGGER)
                #  We empty the buffer, to save memory and because we only need it afterwards the url
                websites[e.website] = ' '

            # Some website :
            # - has a 403 error, eg: complexe3d.com,
            # - is nonexistent website like http://www.biblio.morges.ch
            # - is not a web url ... like [email protected],
            # thhp://www.vitromusee.ch (the typo is on purpose !), www,chateaudeprangins.ch, http://
            # BUG FIX: the handler previously bound the exception to `e`,
            # clobbering the loop variable that holds the current event.
            except (HTTPError, URLError, ValueError) as exc:  # We must know the other kind of error as conversion problem
                pass
def filter_english_tweets(df):
    """Return the rows of *df* whose 'tweet' column is Latin-script English.

    Two boolean-mask passes: first keep tweets that survive a UTF-8
    unicode round trip (drops non-Latin encodings), then keep those that
    guess-language labels 'en'. The result's index is renumbered 0..n-1.
    """
    #REMOVE NON LATIN LANGUAGES
    print 'Removing Non Latin Languages'
    temp = []
    for i,tweet in enumerate(df.tweet):
        try:
            # A tweet equal to its utf8-decoded form contains only
            # characters that survive the round trip.
            if unicode(tweet,'utf8')==tweet:
                temp.append(True)
            else:
                temp.append(False)
        except ValueError:
            temp.append(False)
    
    #GUESS LANGUAGE
    print 'Guessing Language'
    data = df[temp]
    temp = []
    for x in data['tweet']:
        try:
            temp.append(guessLanguage(x)=='en')
        except Exception:
            temp.append(False)
    data = data[temp]
    # Renumber rows after boolean filtering.
    data.index = range(data.shape[0])
    
    return data
def language(text):
    '''Guess the language of *text* using guess-language.

    Known trade-off: this call is the program's speed bottleneck and it
    tends to mislabel English comments as other languages — but it has
    not been seen labelling non-English text as English, so it stays.'''
    guessed = guess_language.guessLanguage(text)
    return guessed
Exemplo n.º 11
0
def _calculate_score(trend, entry):
    """Calculate a score for the given trend and feed entry. The current naive
    implementation works by determining the number of occurrences of the trend
    in the entry title and summary. A score of 0 indicates that the entry is
    not relevant to the trend.

    Args:
        trend: the trend to calculate for.
        entry: the feed entry to calculate a score for.
    """

    pattern = re.compile(r'\b%s\b' % trend, re.IGNORECASE)
    title_hits = pattern.findall(entry.get('title', ''))
    summary_hits = pattern.findall(entry.get('summary', ''))
    occurrences = len(title_hits) + len(summary_hits)

    if not occurrences:
        return 0

    # Filter out content that is not in English
    soup = BeautifulSoup(entry.get('summary', ''))
    plain_summary = ''.join(soup.find_all(text=True))
    if guess_language.guessLanguage(plain_summary) != 'en':
        return 0

    return occurrences
Exemplo n.º 12
0
def index(page=1):
    """Home page: create a post from a valid submission, then list
    the paginated posts of followed users."""
    form = PostForm()
    if form.validate_on_submit():
        detected = guessLanguage(form.post.data)
        if detected == 'UNKNOWN' or len(detected) > 5:
            detected = ''
        entry = Post(body=form.post.data, timestamp=datetime.utcnow(),
                     author=g.user, language=detected)
        db.session.add(entry)
        db.session.commit()
        flash('Your post is now live!')
        # Redirect so refreshing the page does not resubmit the form.
        return redirect(url_for('index'))
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
    return render_template('index.html', title='Home', form=form, posts=posts)
Exemplo n.º 13
0
def user(nickname, page=1):
	"""Profile page for *nickname*: their posts plus a new-post form."""
	form = PostForm()
	if form.validate_on_submit():
		detected = guessLanguage(form.post.data)
		if detected == 'UNKNOWN' or len(detected) > 5:
			detected = ''
		new_post = Post(body=form.post.data,
					timestamp=datetime.utcnow(),
					parent=g.user.key,
					author=g.user.key,
					language=detected)
		new_post.put()
		flash(gettext('Your post is now live!'))
		return redirect(url_for('user', nickname=nickname))
	# Look up the profile owner by nickname.
	user = User.query(User.nickname == nickname).get()
	if user is None:
		flash(gettext('User %(nickname)s not found.', nickname = nickname))
		return redirect(url_for('index'))
	# Newest-first posts under this user's entity group.
	posts = Post.query(ancestor=user.key).order(-Post.timestamp).map(callback)
	return render_template('user.html',
							user=user,
							form=form,
							posts=posts)
Exemplo n.º 14
0
def upgrade(request):
    """Run the sequential schema migrations (v1 through v5) over all docs."""
    docs = request.db.docs

    # legacy -> v1: stamp a version and creation time on unversioned docs.
    for doc in docs.find():
        if "version" not in doc:
            doc["version"] = 1
            doc["created"] = datetime.utcnow()
        docs.save(doc)

    # v1 -> v2: derive search terms and language from searchable_text.
    for doc in docs.find({"version": 1}):
        if "searchable_text" not in doc.keys():
            doc["version"] = 2
            docs.save(doc)
            continue
        searchable_text = doc.pop("searchable_text")
        lang = guessLanguage(searchable_text)
        doc["search_terms"] = index(searchable_text + " " + doc["title"], [lang])
        doc["language"] = lang
        doc["version"] = 2
        docs.save(doc)

    # v2 -> v3: normalise search terms to lowercase.
    for doc in docs.find({"version": 2}):
        doc["version"] = 3
        doc["search_terms"] = [term.lower() for term in doc["search_terms"]]
        docs.save(doc)

    # v3 -> v4: introduce an (empty) keyword list.
    for doc in docs.find({"version": 3}):
        doc["version"] = 4
        doc["keywords"] = []
        docs.save(doc)

    # v4 -> v5: mark every doc as already scanned.
    for doc in docs.find({"version": 4}):
        doc["version"] = 5
        doc["already_scanned"] = True
        docs.save(doc)

    return {"success": 1}
Exemplo n.º 15
0
def file_compare(infile):
    """Print each stripped line of *infile* with its guessed language."""
    with open(infile) as fp:
        for line in fp:
            line = line.strip()
            # get_text() extracts the text portion; then guess its language.
            lang = get_text(line)
            lang = guessLanguage(lang)
            print line, lang
Exemplo n.º 16
0
 def __getattr__(self, name):
     """Compute derived document properties on demand.

     Most results are cached (in __dict__, on the filesystem, or in
     extfields); `raw`, `frags`, and `body` are fetched fresh each time.
     Unknown names that are not metafields raise AttributeError.
     """
     # handle and cache calculated properties
     if name not in self.__dict__ or not self.__dict__[name]:
         if name == 'raw':
             return self._getraw() # cached on fs
         if name == 'text':
             self.__dict__['text'] = self._gettext() # cached in extfields
         if name == 'tokens':
             self.__dict__['tokens'] = self._gettokens() # cached in extfields
         if name == 'stems':
             self.__dict__['stems'] = self._getstems() # cached in extfields
         if name == 'termcnt':
             self.__dict__['termcnt']=self._getstemcount()
         if name == 'tfidf':
             self.__dict__['tfidf']=self._gettfidf()
         if name == 'title':
             self.__dict__['title']=self.docid
         if name == 'frags':
             return self._getfrags() # not cached at all
         if name == 'lang' and 'lang' not in self.__dict__.keys():
             # Guess from the document text; intentionally NOT cached here.
             return guessLanguage(" ".join(self._gettext()))
         if name == 'body':
             return self._getbody() # not cached
         if name in self.metafields:
             return ''
     if name in self.__dict__.keys():
         return self.__dict__[name]
     else:
         raise AttributeError, name
Exemplo n.º 17
0
 def __init__(self,raw=None,docid=None,oid=None,d=None):
     """Load a document by mongo oid or docid, or create one from raw text.

     Raises KeyError when no identifier or raw content is supplied.
     """
     if oid:
         # get by mongo oid
         d=Docs.find_one({"_id": oid})
     elif docid:
         # get by docid
         d=Docs.find_one({"docid": docid})
     if d:
         # load the values
         self.__dict__.update(d)
     elif raw:
         # create a new document
         self.__dict__.update({
             'docid' : docid,
             'pippies' : [],
             'pippiDocs' : [],
             'pippiDocsLen' : 0,
             'rawid' : None,
             })
         if not 'type' in self.__dict__:
             self.__dict__['type']='raw'
         if not 'metadata' in self.__dict__:
             self.__dict__['metadata']={}
         if raw:
             self.raw=raw
             # `self.text` is derived lazily from raw (via __getattr__).
             self.lang=guessLanguage(" ".join(self.text))
         self.save()
     else:
         raise KeyError('empty docid')
Exemplo n.º 18
0
def isNotEnglish(desc):
	"""Return True when *desc* appears to be non-English.

	First a cheap single-pass scan for characters from non-Latin (or
	accented European) scripts, then a trigram-based language guess.
	"""
	# One combined character class replaces the original's eleven separate
	# searches. Ranges merged where contiguous or overlapping: Hiragana +
	# Katakana (U+3040-U+30FF); the Japanese kanji range U+4E00-U+9FAF is
	# a subset of the Chinese CJK range U+4E00-U+9FFF.
	non_english = re.compile(
		u'[\u0400-\u04FF'       # Cyrillic
		u'\u3040-\u30FF'        # Hiragana + Katakana
		u'\uFF00-\uFF9F'        # Half/full-width forms
		u'\u4E00-\u9FFF'        # CJK Unified Ideographs (Chinese/Japanese)
		u'\u3400-\u4DFF'        # CJK Extension A
		u'\uF900-\uFAFF'        # CJK Compatibility Ideographs
		u'\uAC00-\uD7AF'        # Hangul syllables
		u'\u0600-\u06FF'        # Arabic
		u'ğüşöçİĞÜŞÖÇ'          # Turkish
		u'łśźżóń깣ŚŹŻÓŃĘĄ]')  # Polish
	if non_english.search(desc):
		return True

	# Use trigrams to detect language ('en' not in — fixed operator order)
	if 'en' not in lang.guessLanguage(desc):
		return True

	return False
Exemplo n.º 19
0
def index(page=1):
	"""Home page: handle a new-post submission, then render one serialized Page."""
	form = PostForm()
	if form.validate_on_submit():
		detected = guessLanguage(form.post.data)
		if detected == 'UNKNOWN' or len(detected) > 5:
			detected = ''
		new_post = Post(body=form.post.data,
					timestamp=datetime.utcnow(),
					parent=g.user.key,
					author=g.user.key,
					language=detected)
		new_post.put()
		flash(gettext('Your post is now live!'))
		return redirect(url_for('index'))
	# NOTE(review): serialize() rebinds `page`, discarding the page
	# argument — confirm this is intentional.
	page, loc = serialize(Page)
	pages = {
		'name': str(page.name),
		'lat': loc.lat,
		'lon': loc.lon,
	}
	return render_template('index.html',
							title='Home',
							form=form,
							pages=pages)
Exemplo n.º 20
0
def get_features(text):
    """Return features (list of strings) extracted from *text*.

    Dispatches on the guessed language: Japanese and Chinese share the
    Japanese extractor (guess_language sometimes mislabels ja as zh);
    everything else falls back to the English extractor.
    """
    detected = guessLanguage(text)
    if detected in ('ja', 'zh'):
        extractor = get_japanese_features
    else:
        extractor = get_english_features
    return extractor(text)
Exemplo n.º 21
0
def _desc(url, ie_key, title, info):
    """Build a wikitext description from the media info.

    Uses the (escaped) info description, falling back to the title; long
    descriptions with a detectable language are wrapped in {{lang|1=...}}.
    """
    raw = info.get('description', '').strip() or title
    desc = escape_wikitext(raw)
    if len(raw) > 100:
        detected = guess_language.guessLanguage(raw)
        if detected != 'UNKNOWN':
            desc = u'{{' + detected + u'|1=' + desc + u'}}'
    return desc
    def run(self):
        """Worker loop: scrape blog pages from the queue, detect the text's
        language and the author's mood, and append both to per-language
        output files. Runs forever; all errors are swallowed."""
        while True:
            blog_url = self.entry_queue.get()
            try:
                print "BlogPageScraper working on: " + str(blog_url)

                blog = urllib2.urlopen('http://'+blog_url).read()
                soup = BeautifulSoup(blog)
                text_list = soup.findAll(text=True)
                end =0
                start = 0
                # The post body lies between the "Create an Account" header
                # and the "Leave a comment" footer markers.
                for i in range(len(text_list)):
                    if text_list[i].find("Create an Account") != -1:
                        start = i
                    if text_list[i].find("Leave a comment") != -1:
                        end = i
                        break
                text = ''.join([text for text in text_list[start+1:end]])
                lang = guess_language.guessLanguage(text)
                print "language: " + str(lang)

                emotion = None
                for row in soup():
                    found_mood,emotion = self.recursive_mood_find(row,False,emotion)
                    if emotion != None:
                        print " I have found the emotion: "+ unicode(emotion).encode('utf8') + "\n For the page: " + unicode(blog_url).encode('utf8')
                        break

                if emotion != None and emotion != " " and emotion != "":
                    # Serialize file writes across worker threads.
                    self.lock.acquire()
                    try:
                        fname = self.filename +"_"+ lang
                        f = codecs.open(fname,'a+',"utf-8")
                        f2 = codecs.open(fname + "_emotions.txt",'a+',"utf-8")
                        #print unicode(text).encode("utf8")
                        f.write("\n\n")
                        f.write("###BLOG_URL####" + blog_url.decode("utf8") + "#####")
                        f.write("\n")
                        f.write("#!#Emotion#!#"+emotion.decode("utf8")+ "#!#!#!#")
                        f.write("\n")
                        f.write(text)
                        f.close()

                        f2.write("\n\n")
                        f2.write(blog_url.decode("utf8"))
                        f2.write("\n")
                        f2.write(emotion.decode("utf8"))
                        f2.close()

                    except:
                        print  "failed to write from: "+ unicode(blog_url).encode("utf-8")
                    finally:
                        self.lock.release()
            except:
                # BUG: missing `print` — this is a no-op string expression,
                # so scrape errors are silently discarded.
                "got an error scraping page:" + unicode(blog_url).encode("utf-8")
Exemplo n.º 23
0
def create_profile(sender, **kwargs):
    """Signal handler: ensure a UserProfile exists and set its full name.

    Chinese names are written family-name-first with no separating space;
    every other language gets given-name first with a space.
    """
    profile, new = UserProfile.objects.get_or_create(user=kwargs['instance'])
    first = profile.user.first_name
    last = profile.user.last_name
    if gl.guessLanguage(u'%s%s' % (first, last)) == 'zh':
        profile.full_name = u'%s%s' % (last, first)
    else:
        profile.full_name = u'%s %s' % (first, last)
    profile.save()
    def POST(self):
        """Detect the language of the posted 'text' parameter.

        :returns: a dict with a 'language' key when text was supplied,
                  otherwise an empty dict
        """
        result = {}
        text = self.request.POST.get('text')
        if text:
            result['language'] = guess_language.guessLanguage(text)
        return result
Exemplo n.º 25
0
def handle_tweet_pos_tagged(channel, method, properties, body):
    """Queue callback: for an ASCII English tweet, clean its word groups
    and print the top matching places."""
    try:
        payload = json.loads(body)
    except:
        payload = body

    tweet = payload['tweet']
    groups = payload['groups']

    # Only handle plain-ASCII, English-language tweets.
    if not is_ascii(tweet):
        return
    if guess_language.guessLanguage(tweet) != 'en':
        return

    # Strip blacklisted words from each group; drop groups left empty.
    remove_list = ['singapore']
    new_groups = []
    for group in groups:
        kept = [w for w in group.split() if w.lower() not in remove_list]
        cleaned = ' '.join(kept)
        if cleaned != '':
            new_groups.append(cleaned)

    print(tweet)
    print(groups)
    print(new_groups)

    place_finder = PlaceFinder()
    top_places = place_finder.match_text(new_groups, 1, 3)
    for place in top_places:
        print('TOP PLACE', place_finder.get_place(place[0])['name'], place[1])
    print('')
Exemplo n.º 26
0
 def on_status(self, status):
     """Stream callback: print Japanese tweets with JST-adjusted timestamps."""
     try: 
         text = status.text
         if guess_language.guessLanguage(text) == 'ja':
             # Shift the UTC timestamp to JST (UTC+9) for display.
             status.created_at += timedelta(hours=9)
          
             print "-------------------"
             print "tweeted: " + str(status.created_at)
             print text + "\n"
             # col.insert({str(status.created_at): text})
     except Exception, e:
         print >> sys.stderr, 'Encountered ::', e
         pass
Exemplo n.º 27
0
    def grep(self, request, response):
        """
        Get the page indicated by the fuzzable_request and determine the language
        using the preposition list.

        :param request: The HTTP request object.
        :param response: The HTTP response object
        """
        with self._plugin_lock:
            # Guard clauses: plugin finished, non-text response, or 404.
            if not self._exec:
                return
            if not response.is_text_or_html():
                return
            if is_404(response):
                return

            body = response.get_clear_text_body().lower()

            try:
                guessed_lang = guess_language.guessLanguage(body)
            except IndexError:
                # I don't care about exception handling of the external lib
                guessed_lang = 'UNKNOWN'

            if guessed_lang != 'UNKNOWN':
                # Only run until we find the page language
                self._exec = False

                msg = 'The page is written in: "%s".'
                om.out.information(msg % guessed_lang)
                kb.kb.raw_write(self, 'lang', guessed_lang)
                return

            # None means "I'm still trying"
            kb.kb.raw_write(self, 'lang', None)

            # Keep running until self._tries_left is zero
            self._tries_left -= 1

            if self._tries_left == 0:
                msg = ('Could not determine the site language using the'
                       ' first 25 HTTP responses, not enough text to make'
                       ' a good analysis.')
                om.out.debug(msg)

                # unknown means I'll stop testing because I don't
                # have any idea about the target's language
                kb.kb.raw_write(self, 'lang', 'unknown')

                self._exec = False
Exemplo n.º 28
0
def recognize(filedata, accepted_languages, force_detection):
    """OCR *filedata*, retrying across rotations until the detected
    language is in *accepted_languages*.

    Returns (lang, img, text). If no orientation yields an accepted
    language: raises TypeError when force_detection is truthy, otherwise
    returns the last attempt's result.
    """
    with NamedTemporaryFile() as infile:
        infile.write(filedata)
        infile.file.flush()
        with NamedTemporaryFile() as textfile:
            # First OCR pass on the image as-is; non-zero retval means
            # OCR produced no usable text yet.
            retval = ocr(infile.name, textfile.name)
            img = imgopen(infile.name)
            if retval:
                detected_languages = []
                lang = "UNKNOWN"
            else:
                lang = guessLanguage(textfile.read().decode('utf-8'))
                detected_languages = [lang]
            final_filename = infile.name + '-rotated'
            try:
                # Cumulative rotations (180, +90, +180, +0) cover all four
                # orientations; each pass re-runs OCR and re-guesses.
                for rotation in (180, 90, 180, 0):
                    if lang in accepted_languages:
                        textfile.seek(0)
                        return lang, img, textfile.read().decode('utf-8')
                    img = img.rotate(rotation)
                    img.save(final_filename, "JPEG")
                    retval = ocr(final_filename, textfile.name)
                    if retval:
                        continue
                    textfile.seek(0)
                    lang = guessLanguage(textfile.read().decode('utf-8'))
                    detected_languages.append(lang)
            finally:
                # Always remove the rotated temp image.
                try:
                    remove(final_filename)
                except OSError:
                    pass
            if force_detection:
                raise TypeError("Languages %s not in range of accepted "
                                "languages %s" %
                                (str(detected_languages),
                                 str(accepted_languages)))
            return lang, img, textfile.read().decode('utf-8')
Exemplo n.º 29
0
def guess_language(text):  # pragma: no cover
    """Guess the language in which a body of text is written.

    Relies on the optional guess-language package; when it is not
    installed, logs an error and returns Language(Undetermined).
    """
    try:
        from guess_language import guessLanguage
    except ImportError:
        log.error('Cannot detect the language of the given text body, missing dependency: guess-language')
        log.error('Please install it from PyPI, by doing eg: pip install guess-language')
        return UNDETERMINED
    return babelfish.Language.fromguessit(guessLanguage(text))
Exemplo n.º 30
0
def index(page=1):
    """Home page: create a post on valid submit, then render followed posts."""
    user = g.user
    form = PostForm()
    if form.validate_on_submit():
        language = guessLanguage(form.post.data)
        if language == "UNKNOWN" or len(language) > 5:
            language = ""
        post = Post(body=form.post.data, timestamp=datetime.utcnow(), author=g.user, language=language)
        db.session.add(post)
        db.session.commit()
        flash(gettext("Your post is now live!"))
        return redirect(url_for("index"))
    posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)

    # BUG FIX: `title` was assigned but never passed to the template
    # (a dead local); pass it through like the sibling index views do.
    title = "All the news unfit to print"
    return render_template("index.html", title=title, user=user, posts=posts, form=form)
Exemplo n.º 31
0
    words.append(word)

for word in words:
    if word.istitle() == True:
        title = True
        word = word.lower()
    else:
        title = False
    splitword = re.findall(r"[\w']+|[.,!?;—]", word)
    word = splitword[0]
    if len(splitword) >= 2:
        p = True
        punc = splitword[1]
    else:
        p = False
    if guessLanguage(word) == language:
        # Worst way to find a match ever...
        translation = conn.cursor().execute(
            'select ' + tl + ' from \'' + tablename + '\' where ' + language +
            '  = ? limit 1', (word, )).fetchone()
        if translation == None:
            translation = conn.cursor().execute(
                'select ' + tl + ' from \'' + tablename + '\' where ' +
                language + '  like ? limit 1',
                ('%' + word + '%', )).fetchone()
            if translation == None:
                translation = conn.cursor().execute(
                    'select ' + tl + ' from \'' + tablename + '\' where ' +
                    language + '  like ? limit 1',
                    ('%' + word[:-2] + '%', )).fetchone()
                if translation == None:
Exemplo n.º 32
0
def site_index(request):
    """Preliminary site index page.

    GET renders an empty input form.  POST validates the submission and
    either starts the Zotero OAuth flow (when a username was given) or
    infers search terms from the pasted text, stores them in the session
    and redirects to the discoveries view.  An invalid POST redisplays
    the form with error messages.
    """
    # TODO, possibly -- might be worth supporting HEAD requests
    # since this is the site index

    if request.method == 'POST':
        # On post, bind the form to the posted data; if it is invalid we
        # fall through and redisplay it with error messages.
        form = InputForm(request.POST)
        if form.is_valid():

            # actual logic here - infer search terms, query apis, display stuff

            text = form.cleaned_data['text']
            zotero_user = form.cleaned_data['zotero_user']

            search_terms = {}
            if zotero_user:
                # Remember the user and hand off to the Zotero OAuth flow.
                request.session['username'] = zotero_user
                return HttpResponseRedirect(
                    zotero.oauth_authorize_url(request))

            elif text:
                lang = guess_language.guessLanguage(text)
                logger.debug('language detected as %s' % lang)
                common_terms = common_words(text, 15, lang)
                dbpedia_terms = get_search_terms(text, lang)

                # too many terms? phrase? didn't get results when combining
                # TODO: combine dbpedia + common terms; randomize from dbpedia results
                #search_terms['keywords'].extend(dbpedia_terms['keywords'])

                search_terms['keywords'] = list(dbpedia_terms['keywords'])[:10]

                # if no terms found in dbpedia, use common terms instead
                # (todo: should be some kind of combination)
                if not search_terms['keywords']:
                    search_terms['keywords'] = common_terms['keywords']

                # within dbpedia_terms there are now lists for people,
                # places and dates {'early': ,'late': }.  People and places
                # were reconciled against DBpedia; dates contains only
                # four-digit values.

            # store search terms in the session so the redirected view can
            # pick them up
            request.session['search_terms'] = search_terms

            # redirect
            # NOTE: should probably be http code 303, see other
            return HttpResponseRedirect(reverse('discoveries:view'))

        # if not valid: pass through and redisplay errors
    else:
        # Bug fix: previously only GET initialized the form, so any other
        # method (e.g. HEAD, PUT) hit an unbound `form` at render time.
        form = InputForm()

    return render(request, 'core/site_index.html', {'input_form': form})
Exemplo n.º 33
0
doc_id = sys.argv[1]
docs = Document.objects.filter(doc_id=doc_id)
if len(docs) == 0:
    print "Document %s not found" % doc_id
    sys.exit(1)

doc = docs[0]
asset_path = os.path.join(ORIGINAL_MEDIA_PATH, "%s" % doc.docfile)
txt_path = get_txt_path(doc_id)
type_of_file = get_type_of_file(asset_path)

error_string = ''
if type_of_file:
    converted = convert_file_to_txt(asset_path, txt_path, type_of_file)
    language = guess_language.guessLanguage(file(txt_path).read())
    if not converted:
        error_string = ERROR_CONVERSION_ERROR
    if converted:
        generate_word_list(txt_path, language)
else:
    converted = False
    error_string = ERROR_UNKNOWN_TYPE_OF_FILE

doc.converted = converted
doc.format_type = type_of_file
doc.language = language
doc.size = os.stat(asset_path).st_size
doc.txtfile = txt_path

print 'language =', doc.language
Exemplo n.º 34
0
    def test_guess(self):
        """guessLanguage should identify each sample text's language, and
        the guessLanguage* variants should agree on tag, name and id.

        Note: `assertEquals` (a deprecated alias, removed in Python 3.12)
        has been replaced with `assertEqual`.
        """
        # (text, expected language code) pairs covering many scripts.
        tests = [
            ("This is a test of the language checker", "en"),
            ("Verifions que le détecteur de langues marche", "fr"),
            ("Sprawdźmy, czy odgadywacz języków pracuje", "pl"),
            ("авай проверить  узнает ли наш угадатель русски язык", "ru"),
            ("La respuesta de los acreedores a la oferta argentina para salir del default no ha sido muy positiv",
             "es"),
            ("Сайлау нәтижесінде дауыстардың басым бөлігін ел премьер министрі Виктор Янукович пен оның қарсыласы, оппозиция жетекшісі Виктор Ющенко алды.",
             "kk"),  # Kazakh
            ("милиция ва уч солиқ идораси ходимлари яраланган. Шаҳарда хавфсизлик чоралари кучайтирилган.",
             "uz"),  # uzbek
            ("көрбөгөндөй элдик толкундоо болуп, Кокон шаарынын көчөлөрүндө бир нече миң киши нааразылык билдирди.",
             "ky"),  # kyrgyz
            ("yakın tarihin en çekişmeli başkanlık seçiminde oy verme işlemi sürerken, katılımda rekor bekleniyor.",
             "tr"),
            ("Daxil olan xəbərlərdə deyilir ki, 6 nəfər Bağdadın mərkəzində yerləşən Təhsil Nazirliyinin binası yaxınlığında baş vermiş partlayış zamanı həlak olub.",
             "az"),  # Azerbaijani
            (" ملايين الناخبين الأمريكيين يدلون بأصواتهم وسط إقبال قياسي على انتخابات هي الأشد تنافسا منذ عقود",
             "ar"),
            ("Американське суспільство, поділене суперечностями, збирається взяти активну участь у голосуванні",
             "uk"),  # ukrainian
            ("Francouzský ministr financí zmírnil výhrady vůči nízkým firemním daním v nových členských státech EU",
             "cs"),  # czech
            ("biće prilično izjednačena, sugerišu najnovije ankete. Oba kandidata tvrde da su sposobni da dobiju rat protiv terorizma",
             "hr"),  # croatian
            (" е готов да даде гаранции, че няма да прави ядрено оръжие, ако му се разреши мирна атомна програма",
             "bg"),  # bulgarian
            ("на јавното мислење покажуваат дека трката е толку тесна, што се очекува двајцата соперници да ја прекршат традицијата и да се појават и на самиот изборен ден.",
             "mk"),  # macedonian
            ("în acest sens aparţinînd Adunării Generale a organizaţiei, în ciuda faptului că mai multe dintre solicitările organizaţiei privind organizarea scrutinului nu au fost soluţionate",
             "ro"),  # romanian
            ("kaluan ditën e fundit të fushatës në shtetet kryesore për të siguruar sa më shumë votues.",
             "sq"),  # albanian
            ("αναμένεται να σπάσουν παράδοση δεκαετιών και να συνεχίσουν την εκστρατεία τους ακόμη και τη μέρα των εκλογών",
             "el"),  # greek
            (" 美国各州选民今天开始正式投票。据信,", "zh"),  # chinese
            (" Die kritiek was volgens hem bitter hard nodig, omdat Nederland binnen een paar jaar in een soort Belfast zou dreigen te veranderen",
             "nl"),  # dutch
            ("På denne side bringer vi billeder fra de mange forskellige forberedelser til arrangementet, efterhånden som vi får dem ",
             "da"),  # danish
            ("Vi säger att Frälsningen är en gåva till alla, fritt och för intet.  Men som vi nämnt så finns det två villkor som måste",
             "sv"),  # swedish
            ("Nominasjonskomiteen i Akershus KrF har skviset ut Einar Holstad fra stortingslisten. Ytre Enebakk-mannen har plass p Stortinget s lenge Valgerd Svarstad Haugland sitter i",
             "nb"),  # norwegian
            ("on julkishallinnon verkkopalveluiden yhteinen osoite. Kansalaisten arkielämää helpottavaa tietoa on koottu eri aihealueisiin",
             "fi"),  # finnish
            ("Ennetamaks reisil ebameeldivaid vahejuhtumeid vii end kurssi reisidokumentide ja viisade reeglitega ning muu praktilise informatsiooniga",
             "et"),  # estonian
            ("Hiába jön létre az önkéntes magyar haderő, hiába nem lesz többé bevonulás, változatlanul fennmarad a hadkötelezettség intézménye",
             "hu"),  # hungarian
            ("հարաբերական", "hy"),  # armenian
            ("Hai vấn đề khó chịu với màn hình thường gặp nhất khi bạn dùng laptop là vết trầy xước và điểm chết. Sau đây là vài cách xử lý chú",
             "vi"),
            ("ii", UNKNOWN),

            # This text has a mix of Hirigana, Katakana and CJK which requires the fix for issue:3 to classify correctly
            ("トヨタ自動車、フィリピンの植林活動で第三者認証取得 トヨタ自動車(株)(以下、トヨタ)は、2007年9月よりフィリピンのルソン島北部に位置するカガヤン州ペニャブラン",
             'ja'),
        ]

        for text, name in tests:
            self.assertEqual(name, guessLanguage(text))

        # The richer API variants should all agree on the same French text.
        text = "Verifions que le détecteur de langues marche"
        self.assertEqual('fr', guessLanguageTag(text))
        self.assertEqual('French', guessLanguageName(text))
        self.assertEqual(26150, guessLanguageId(text))
        self.assertEqual(('fr', 26150, 'French'), guessLanguageInfo(text))
Exemplo n.º 35
0
def fetch_usr_tips(user_id):
    """Fetch up to 5000 Foursquare tips for *user_id* and summarize them.

    Returns a dict with the tip count and per-tip metadata (text, venue,
    category, sentiment polarity for English tips).  When the API reports
    an error, returns a dict carrying 'error_meta' (and 'user existence'
    for unknown users).  Returns -2 when the request cannot be completed
    after AUTO_RECONNECT_TIMES attempts.
    """
    # SECURITY(review): OAuth token is hard-coded; it should come from
    # configuration, not source code.
    super_token = 'QEJ4AQPTMMNB413HGNZ5YDMJSHTOHZHMLZCAQCCLXIX41OMP'
    fetch_url_str = 'https://api.foursquare.com/v2/users/' + str(user_id) + '/tips?oauth_token='+super_token + \
                    '&limit=5000&v=20141231'
    content = ''
    success = 0
    retry = 0
    while success == 0:
        try:
            content = get_raw_info(fetch_url_str)
            if content != -1 and content != -2:
                success = 1
            else:
                # Bug fix: failed fetches that returned -1/-2 without
                # raising previously looped forever because `retry` was
                # only incremented in the except branch.
                time.sleep(3)
                retry += 1
                if retry == AUTO_RECONNECT_TIMES:
                    return -2
        except Exception:
            time.sleep(3)
            retry += 1
            if retry == AUTO_RECONNECT_TIMES:
                return -2

    output_dict = {}
    content_json = json.loads(content)
    output_dict['tips content'] = []
    if content_json['meta']['code'] != 200:
        output_dict['error_meta'] = str(content_json['meta']['code'])
        if str(content_json['meta']['errorDetail']) == "Must provide a valid user ID or 'self.'":
            output_dict['user existence'] = '-1'
        return output_dict

    output_dict['count'] = content_json['response']['tips']['count']
    for item in (content_json['response']['tips']['items']):
        if 'cc' in item['venue']['location']:
            venue_country = item['venue']['location']['cc']
        else:
            venue_country = '-'
        a = {}
        a['len'] = len(item['text'])
        a['text'] = item['text'].encode('utf-8')
        a['venue name'] = item['venue']['name'].encode('utf-8')
        # (sic) 'timespam' key name kept for downstream compatibility.
        a['timespam'] = str(item['createdAt'])
        a['venue country'] = venue_country

        if 'photo' in item:
            a['photo'] = "y "
        else:
            a['photo'] = "n "
        cate_info = item['venue']['categories']
        if len(cate_info) > 0:
            # Keeps only the last category when a venue has several.
            for xx in cate_info:
                a['category'] = get_venue_category(xx['name'])
        else:
            a['category'] = '-'

        # Sentiment polarity is only computed for English tips.
        tip_text = a['text']
        tip_language = guess_language.guessLanguage(tip_text)
        if tip_language == 'en':
            testimonial = TextBlob(tip_text)
            a['polarity'] = testimonial.sentiment.polarity
        else:
            a['polarity'] = '-'
        output_dict['tips content'].append(a)
    return output_dict
Exemplo n.º 36
0
#!/usr/bin/env python

# Takes an input and prints only the language specified
# Usage: python print-lang.py [filename] [lang-code]
# i.e., python print-lang.py 04.md ru or python print-lang.py 04.md en

import sys
import enchant
from guess_language import guessLanguage

input_file = sys.argv[1]
lang = sys.argv[2]
# Keep the output *filename* distinct from the file *object* (the original
# reused one name for both).
output_path = (input_file.rsplit(".", 1)[0]) + "_" + lang + ".md"

# Context managers guarantee both handles are closed even on error; the
# original never closed either file.
with open(input_file, 'r') as myfile, open(output_path, 'w') as output_file:
    for line in myfile:
        if guessLanguage(line) == lang:
            # Bug fix: `line` already ends with its newline, so appending
            # "\n" inserted a blank line after every kept line.
            output_file.write(line)
Exemplo n.º 37
0
def main(argv=None):
    """ Main function """
    if argv is None:
        argv = sys.argv

    points = []
    labels = []
    ru = []
    en = []
    uk = []
    pl = []
    ru_labels = []
    en_labels = []
    uk_labels = []
    pl_labels = []
    points_labels = []
    title_labels = dict()
    lang_labels = dict()
    type_labels = dict()

    type_file = open('../youTubeData/video_type', "r")
    type_line = type_file.readline()
    while not type_line == "":
        type_labels[type_line.split(";")[0]] = type_line.split(";")[1].strip()
        type_line = type_file.readline()

    lang_file = open('../youTubeData/manually_recognized', "r")
    lang_line = lang_file.readline()
    while not lang_line == "":
        lang_labels[lang_line.split(";")[0]] = lang_line.split(";")[1].strip()
        lang_line = lang_file.readline()

    title_file = open("../youTubeData/all_frames_stats_title", "r")
    line = " "
    manual = 0
    while not line == "":
        line = title_file.readline()
        lbl = line.strip()
        # .lower().replace("stepan bandera","en")
        title = title_file.readline()
        title_labels[lbl] = title
        title_lang = guess_language.guessLanguage(strip_tags(title))
        desc = title_file.readline()

        try:
            desc.split("No description available")[1]
            desc = ""
        except IndexError:
            pass
        desc_lang = guess_language.guessLanguage(strip_tags(desc))
        line = title_file.readline()
        lang = guess_language.guessLanguage(
            strip_tags(title) + strip_tags(desc))

        if lbl in lang_labels:
            print lbl, " found"
            continue

        print lbl, " not found"
        if lang in ['uk', 'ru', 'pl', 'en']:
            lang_labels[lbl] = lang
        else:
            manual += 1
            print "------------------------------------------------------------"
            print title_lang, desc_lang
            print title
            print desc
            print "------------------------------------------------------------"
            l = raw_input("which language? ")
            lang_labels[lbl] = l

        print manual, " manually recognized"

    lang_file.close()
    lang_file_content = ""
    print lang_labels
    for key in lang_labels.keys():
        lang_file_content += key + ";" + lang_labels[key] + "\r\n"

    print lang_file_content
    lang_file = open('../youTubeData/manually_recognized', "w")
    lang_file.write(lang_file_content)
    lang_file.close()

    text_file = open('../youTubeData/all_frames_stats', "r")
    line = " "
    counter = 0
    while not line == "":
        line = text_file.readline()
        counter = counter + 1
        try:
            values = map(float, line.split('\r\n')[0].split(';')[1:])
            label = line.split(';')[0]
            labels.append(label)
            size, min_max, mean, variance, skew, kurt = scipy.stats.describe(
                values)
            # throw out vids under 20 seconds
            if size < 20:
                print "video too short: ", label
                continue

            # cut min variance at 0.001, otherwise the plot gets quite
            # distorted
            if variance < 0.001:
                variance = 0.001

        except ValueError:
            print "error calculating stats for ", label
            continue

        point = [mean, variance]

        try:
            if lang_labels[label] == "uk":
                uk.append(point)
                uk_labels.append(label)
            elif lang_labels[label] == "ru":
                ru.append(point)
                ru_labels.append(label)
            elif lang_labels[label] == "en":
                en.append(point)
                en_labels.append(label)
            elif lang_labels[label] == "pl":
                pl.append(point)
                pl_labels.append(label)
            else:
                points.append(point)
                points_labels.append(label)
        except:
            points.append(point)
            points_labels.append(label)

    print counter, " lines read."
    print "number of labels ", str(len(labels))
    print "number of labels ", str(len(lang))

    pylab.show()
    pylab.xlabel('Mean')
    pylab.ylabel('Variance')
    # pylab.yscale("log")
    pylab.title("Frame Likenesses of Bandera Youtube Clips")

    pylab.plot(*zip(*points), marker='o', color='w', ls='')
    pylab.plot(*zip(*uk), marker='o', color='#ff8000', ls='')
    pylab.plot(*zip(*ru), marker='o', color='#b40404', ls='')
    pylab.plot(*zip(*pl), marker='o', color='#66FF00', ls='')
    pylab.plot(*zip(*en), marker='o', color='#819FF7', ls='')

    figure = pylab.gcf()
    figure.set_size_inches(figure.get_size_inches()[0] * 2,
                           figure.get_size_inches()[1] * 2)
    figure.savefig('video_langs.png', bbox_inches='tight')
Exemplo n.º 38
0
def detect_language(content):
    """Return the language code guessed for *content*."""
    # Deferred import: loading guess_language takes noticeable time, so we
    # only pay that cost when detection is actually requested.
    import guess_language

    return guess_language.guessLanguage(content)
Exemplo n.º 39
0
def tuling_reply(msg):  # the main function used for message processing
    """Route an incoming WeChat message to an English or Chinese auto-reply.

    Detects the message language, supports per-sender opt-in/opt-out
    ("start"/"stop" in English, <开始>/<关闭> in Chinese), dispatches
    special commands, and falls back to a chatbot response with a default
    reply when the bot has no answer.  Relies on module-level state:
    `replylist` (opted-in senders) and `open_reminder` (senders already
    greeted), persisted via write_replyDB()/write_reminderDB().
    """
    #print(msg)
    #print(msg["User"].split(",")[0].split(":")[2])
    #(msg["FromUserName"]=="@d57c5b7f0fff1374fa5b38594ec49362")
    # To keep replying even when the Tuling API key has problems, a default
    # reply is configured here; if the key fails, `reply` would be None.
    # `a or b` means: return a when a is truthy, otherwise b.
    # "Truthy" generally means non-empty/non-None; test with `if a: print('True')`.

    lang = guessLanguage(
        msg["Text"])  # guess the language type and give different response

    sender_alias = msg['User']['Alias'].replace(
        "_", ""
    )  # the unique id of senders(wechat users), attention: the FromUserId or something is not unique, which changes nexttime you log in.
    sender_city = msg['User']['Province'] + u'省' + msg['User']['City'] + u'市'
    sender_nickName = msg['User']['NickName']
    if lang == 'en':  # is the message type is english
        first_greetings_EN = u"Hello, I'm xiaobo belonging to Zhenbo Xu, may I help you? If it's a emergency, please contact my owner by phone call or SMS. I can chat with you if you want, send <start> to begin and send <stop> to shut me down."
        defaultReply_EN = u'OMG, this question stumped me. Could you please wait for my master?'
        if msg["Text"] == "start":  # the sender start the service
            if sender_alias not in replylist:
                replylist.append(
                    sender_alias)  # add sender alias to identify the sender
                write_replyDB()  # write to the file
                if sender_alias not in open_reminder:
                    open_reminder.append(sender_alias)
                    write_reminderDB()
            return u'xiaobo auto reply started'  # notification send back to the sender
        elif msg["Text"] == "stop":  # stop the service
            try:
                replylist.remove(sender_alias)
            except:
                pass
            write_replyDB()
            return u'xiaobo auto reply stopped'
        if sender_alias in replylist:  # if the user in the list of who started the service, then process it
            special_reply = special_function(msg["Text"])
            if special_reply:
                return special_reply
            return get_response(
                msg['Text']) or defaultReply_EN  # .decode('unicode-escape')
        else:  # to find whether it needs to notify him/her the exist of xiaobo
            if sender_alias not in open_reminder:
                return first_greetings_EN
            else:
                pass
    else:  # default to Chinese
        first_greetings_CN = sender_nickName + u'你好啊。嘿嘿,我是徐振博家的小机器人小博,有什么可以帮助您的么.有急事请通过短信或者电话联系主人。我可以和您聊天,发送<开始>两个字就可以啦,发送<关闭>可以把烦人的我关掉。现在我会自动谷歌翻译,查天气,歇后语,查邮编查公交>等等。另外,如果有个功能您特别想要,可以联系我主人实现一下'
        defaultReply_CN = u'那个,这个机器人不会回答这个问题,不过他会谷歌翻译(回复<谷歌>查看详细信息),输地名查天气,查单词,歇后语,查邮编查公交等等。另外,如果有个功能您特别想要,可以联系我,我有空实现一下'
        if msg["Text"] == u'开始':
            if sender_alias not in replylist:
                replylist.append(sender_alias)
                write_replyDB()
                if sender_alias not in open_reminder:
                    open_reminder.append(sender_alias)
                    write_reminderDB()
            return u'小博自动回复已开启'
        elif msg["Text"] == u'关闭':
            try:
                replylist.remove(sender_alias)
            except:
                pass
            write_replyDB()
            return u'小博自动回复已关闭'
        if sender_alias in replylist:
            annoy_to_close = u' '  # hint that xiaobo can be turned off
            if len(
                    msg["Text"]
            ) < 10:  # if short than 20, then do feature identify, and give detailed query way.
                if u'关' in msg["Text"] or u'烦' in msg["Text"]:
                    annoy_to_close = u'\n如果觉得我很烦,可以发送<关闭>二字关掉我哦-.- \n'
            special_reply = special_function(msg["Text"])
            if special_reply:
                return special_reply
            tt = get_response(msg['Text'], sender_alias, sender_city)
            return tt + annoy_to_close or defaultReply_CN + annoy_to_close  # .decode('unicode-escape')
        else:
            if sender_alias not in open_reminder:
                open_reminder.append(sender_alias)
                write_reminderDB()
                return first_greetings_CN
            elif u'小博' in msg["Text"]:
                return u'miss me? 你知道要发送"开始"两个字让我主人把我打开,对吧?'
            else:
                pass
Exemplo n.º 40
0
                # NOTE(review): fragment -- the enclosing loop over URLs and
                # the definitions of `url` and `args` are outside this
                # excerpt.  Extract an article body and skip it unless it
                # is long enough and in the requested language.
                try:
                    extractor = Extractor(extractor='ArticleExtractor',
                                          url=url)
                    extracted_text = extractor.getText()

                    #skip text if size is less than threshold
                    if len(extracted_text) < 500:
                        print(
                            "\n***SIZE IS TOO SMALL, TEXT IS EXCLUDED!!! (in main.py, extractor)\n"
                        )
                        # sleep for 4 seconds before trying crawling agian, otherwise you will be identified and blocked
                        time.sleep(4)
                        continue

                    #skip text if the language is not same as requested
                    lang_tmp = guessLanguage(extracted_text).encode('utf-8')
                    if lang_tmp != args.lang:
                        print(
                            "\n***WRONG LANGUAGE!!! (in main.py, guessLanguage)"
                        )
                        # sleep for 4 seconds before trying crawling agian, otherwise you will be identified and blocked
                        time.sleep(4)
                        continue
                except:
                    #if 'request timeout' happens go to the next URL
                    # NOTE(review): bare except also hides programming errors,
                    # not just timeouts.
                    e = sys.exc_info()[0]
                    print("\n***ERROR (in main.py, extractor 2): " + str(e))
                    # sleep for 4 seconds before trying crawling agian, otherwise you will be identified and blocked
                    time.sleep(4)
                    continue
                #article_id += 1
Exemplo n.º 41
0
#!/usr/bin/env python

# Smoke test: guessLanguage should identify the word "hi" as English.

from guess_language import guessLanguage

enguess = guessLanguage('hi')

# Print a single-word verdict for the caller/CI to inspect.
verdict = "PASS" if str(enguess) == 'en' else "FAIL"
print(verdict)