Пример #1
0
    def test_guess(self):
        """Exercise guess_language and its tag/name/id/info variants.

        Every entry of ``self.tests`` is unpacked as ``(text, expected)`` and
        every entry of ``self.tests_with_hints`` as
        ``(text, expected, hints)``. A fixed French sentence is then used to
        check the tag, name, id and info helpers.
        """
        for sample, expected in self.tests:
            self.assertEqual(guess_language(sample), expected)

        for sample, expected, hints in self.tests_with_hints:
            self.assertEqual(guess_language(sample, hints), expected)

        french = "Vérifions que le détecteur de langue fonctionne."
        tag, lang_id, lang_name = "fr", 26150, "French"
        self.assertEqual(guess_language_tag(french), tag)
        self.assertEqual(guess_language_name(french), lang_name)
        self.assertEqual(guess_language_id(french), lang_id)
        self.assertEqual(guess_language_info(french),
                         (tag, lang_id, lang_name))
Пример #2
0
 def parse(self, **kwargs):
     """Extract metadata from an Rmd file.

     Keyword arguments:
         p: path of the file to parse.
         md: master metadata dict, forwarded to the YAML parser.
         m: truthy to apply the multiline rule set; otherwise the whole
            file content is handed to ``parse_r`` as one code block.
         is_debug: debug flag forwarded to ``status_note`` and the YAML
            parser.
         xo: stay-offline flag forwarded to the YAML parser.

     Returns:
         The dict of extracted metadata, or the string ``'error'`` when the
         file cannot be decoded, the YAML parser reports ``'error'``, or any
         other exception is raised.
     """
     # Read before the try block so the error handlers can always use it.
     is_debug = kwargs.get('is_debug', None)
     try:
         path_file = kwargs.get('p', None)
         MASTER_MD_DICT = kwargs.get('md', None)
         multiline = kwargs.get('m', None)
         stay_offline = kwargs.get('xo', None)
         data_dict = {'mainfile': path_file,
                      'depends': []}
         try:
             with open(path_file, encoding='utf-8') as input_file:
                 content = input_file.read()
             if multiline:
                 # reset key; try guess lang from the first long run of
                 # word/space/punctuation characters:
                 data_dict['paperLanguage'] = []
                 t = re.search(r'([\w\d\s\.\,\:]{300,1200})', content, flags=re.DOTALL)
                 if t:
                     # Guess once and reuse the result.
                     language = guess_language(t.group(1))
                     if language is not None:
                         data_dict['paperLanguage'].append(language)
                 # process rules; each rule is "<name>\t<regex>"
                 for rule in rule_set_rmd_multiline:
                     this_rule = rule.split('\t')
                     s = re.search(this_rule[1], content, flags=re.DOTALL)
                     if s:
                         if this_rule[0].startswith('yaml'):
                             from parsers.parse_yaml import ParseYaml
                             parsed = ParseYaml().internal_parse(s.group(1), MASTER_MD_DICT, stay_offline, is_debug)
                             if parsed == 'error':
                                 return parsed
                             data_dict.update(parsed)
                         if this_rule[0].startswith('rblock'):
                             data_dict = parse_r(s.group(1), data_dict)
             else:
                 # parse entire file as one code block
                 data_dict.update(r_codeblock=parse_r(content, data_dict))
         except UnicodeDecodeError:
             # Report the path that was actually opened.
             status_note(['! error, failed to decode <', path_file, '>'], d=is_debug)
             return 'error'
         # save to list of extracted metadata:
         data_dict['provenance'] = get_prov(path_file)
         # todo: reenable saving/outputting the results:
         #if metafiles_all:
         #    output_extraction(data_dict, out_format, out_mode, path_file)
         return data_dict
     except Exception:
         status_note('! error while extracting Rmd', d=is_debug)
         return 'error'
Пример #3
0
 def _to_db_row(fields):
     source_text = fields.get('sourceText', "").strip()
     target_text = fields.get('targetText', "").strip()
     res = {}
     if source_text:
         res.update({
             'source_text': source_text,
             'source_lang': guess_language(source_text),
         })
     if target_text:
         res.update({
             'target_text': target_text,
             'target_lang': guess_language(target_text),
         })
     return res
Пример #4
0
def get_language_tool_results(filename, file_contents, locale):
    """Yield LanguageTool findings for the given file contents.

    :param filename:      Name used as key of the diff dict and in the
                          yielded source range.
    :param file_contents: Sequence of lines; they are joined for checking.
    :param locale:        Locale to check with. 'auto' guesses it from the
                          text; a falsy value falls back to 'en-US'.
    :return:              Generator of ``(message, diffs, source_range)``
                          tuples. ``diffs`` is None when the match has no
                          replacements.
    """
    text = "".join(file_contents)
    if locale == 'auto':
        locale = guess_language(text)
    if not locale:
        locale = 'en-US'

    checker = LanguageTool(locale)
    for match in checker.check(text):
        diffs = None
        if match.replacements:
            corrected_lines = correct(text, [match]).splitlines(True)
            diffs = {filename:
                     Diff.from_string_arrays(file_contents, corrected_lines)}

        # Append the sub-rule id, when there is one, to the rule id.
        rule_id = match.ruleId
        if match.subId is not None:
            rule_id = rule_id + '[{}]'.format(match.subId)

        yield (match.msg + ' (' + rule_id + ')',
               diffs,
               SourceRange.from_values(filename,
                                       match.fromy+1,
                                       match.fromx+1,
                                       match.toy+1,
                                       match.tox+1))
Пример #5
0
def urldata(url):
    """Fetch *url* and return its visible text fragments guessed as Arabic.

    The page is requested with a browser-like User-Agent header and parsed
    with BeautifulSoup. Text nodes accepted by ``visible`` are stripped,
    empty ones are dropped, and only those whose guessed language is 'ar'
    are returned. Progress is printed to stdout along the way.

    :param url: Address of the page to download.
    :return:    List of stripped text fragments detected as Arabic.
    """
    print('Opening page...', end=' ')
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    # Use the response as a context manager so the connection is closed
    # as soon as the body has been read.
    with urllib.request.urlopen(req) as response:
        html = response.read()
    print('Done')

    print('Parsing...', end=' ')
    soup = BeautifulSoup(html, 'html.parser')
    print('Done')

    print('Finding all text...', end=' ')
    text_findAll = soup.findAll(text=True)
    print('Done')

    print('Filtering visible texts...', end=' ')
    stripped = (node.strip() for node in filter(visible, text_findAll))
    visible_texts = [text for text in stripped if text]
    print('Done')

    print('Filtering arabic texts...', end=' ')
    arabic = [text for text in visible_texts
              if guess_language.guess_language(text) == 'ar']
    print('Done')
    return arabic
Пример #6
0
def find_out_language(candidate_languages, *args):
    """Return the candidate language detected most often in the samples.

    Every sample is run through ``guess_language`` and ``detect_langs``;
    each detection that is one of ``candidate_languages`` counts as one
    vote. The language with the most votes wins; on a tie the language
    that was detected first wins.

    :param candidate_languages: Container of acceptable language codes.
    :param args:                Text samples to analyse.
    :return:                    The winning language code, or None when no
                                sample produced an acceptable detection or
                                the winner is ``UNKNOWN_LANGUAGE``.
    """
    candidates = []
    for sample in args:
        guessed = guess_language(sample)
        if guessed != UNKNOWN_LANGUAGE and guessed in candidate_languages:
            candidates.append(guessed)
        try:
            detected = detect_langs(sample)
        except LangDetectException:
            # This detector could not handle the sample; keep any vote
            # already collected for it and move on.
            continue
        candidates.extend(result.lang for result in detected
                          if result.lang in candidate_languages)

    if not candidates:
        return None

    # max() returns the first maximal element, so ties go to the language
    # that was detected first.
    leading = max(candidates, key=candidates.count)
    if leading == UNKNOWN_LANGUAGE:
        return None
    return leading
Пример #7
0
def fetch_word():
    """Concatenate the cleaned text of English tweets into one string.

    Up to 1000 tweets are queried. Every tweet with text gets an entry
    (initialised to None) in ``tweet_tfidf_dict`` and
    ``tweet_wordcount_dict``. Tweets guessed to be English are UTF-8
    encoded, lower-cased and stripped of punctuation, and each one is
    appended to the result preceded by a single space.
    """
    Tweet = db.Tweet
    session = db.session

    cleaned = []
    for tweet in session.query(Tweet).limit(1000):
        if not tweet.text:
            continue

        # Initializing final dictionary {tweet:tfidf total} & {tweet:# of words}
        tweet_tfidf_dict[tweet.text] = None
        tweet_wordcount_dict[tweet.text] = None

        if guess_language(tweet.text) != 'en':
            continue

        lowered = str(tweet.text.encode('utf-8')).lower()
        # All words (including hashtags) of the tweet, punctuation removed.
        cleaned.append(lowered.translate(None, string.punctuation))

    return ''.join(' ' + words for words in cleaned)
def main():
    """Append English tweets from a random sample of files to tweetdata.txt.

    The root directory is taken from ``sys.argv[1]`` and searched for
    ``*.txt`` files. Up to 25 of them are picked at random. Every line is
    parsed as JSON; lines whose "text" is guessed to be English are
    appended verbatim to ``tweetdata.txt``. Processing of the current file
    stops when the running counter of English tweets reaches 501, and the
    counter is then reset.
    """
    rootdirname = sys.argv[1]
    print(rootdirname)
    # read all the files in the directory with .txt extension
    listfiles = recursive_glob(rootdirname, "*.txt")
    if len(listfiles) == 0:
        print ("No files present")
        sys.exit(1)

    i = 0
    # random.sample raises ValueError when asked for more items than
    # there are, so never request more files than were found.
    randfileslist = random.sample(listfiles, min(25, len(listfiles)))
    for path in randfileslist:
        print ("Current file is :", path)
        # Context managers make sure both files are closed (and the output
        # flushed) before moving on to the next input file.
        with open(path, "r") as infile, \
                open('tweetdata.txt', mode='at') as txtfile:
            for line in infile:
                try:
                    tweet = simplejson.loads(line)
                    if "text" in tweet:
                        text = str(tweet["text"].encode('utf-8', 'ignore'))
                        language = guess_language(text)
                        if language == 'en':
                            i = i + 1
                            if i == 501:
                                i = 0
                                break
                            txtfile.write(line)
                except ValueError:
                    # Deliberately skip lines that cannot be processed.
                    pass
Пример #9
0
    def run(self,
            filename,
            file,
            natural_language: str = 'auto',
            languagetool_disable_rules: typed_list(str) = (),
            ):
        """
        Checks the code with LanguageTool.

        :param natural_language:           A locale representing the language
                                           you want to have checked. If set to
                                           'auto' the language is guessed.
                                           If the language cannot be guessed or
                                           an unsupported language is guessed,
                                           'en-US' is used.
        :param languagetool_disable_rules: List of rules to disable checks for.
        """
        # Defer import so the check_prerequisites can be run without
        # language_check being there.
        from language_check import LanguageTool, correct

        joined_text = ''.join(file)
        natural_language = (guess_language(joined_text)
                            if natural_language == 'auto'
                            else natural_language)

        try:
            tool = LanguageTool(natural_language, motherTongue='en_US')
        except ValueError:
            # Using 'en-US' if guessed language is not supported.
            # logging.warning is used because logging.warn is a deprecated
            # alias.
            logging.warning(
                "Changing the `natural_language` setting to 'en-US' as "
                '`language_check` failed to guess a valid language.'
            )
            natural_language = 'en-US'
            tool = LanguageTool(natural_language, motherTongue='en_US')

        tool.disabled.update(languagetool_disable_rules)
        matches = tool.check(joined_text)
        for match in matches:
            # A diff can only be offered when the match has replacements.
            if not match.replacements:
                diffs = None
            else:
                replaced = correct(joined_text, [match]).splitlines(True)
                diffs = {filename:
                         Diff.from_string_arrays(file, replaced)}

            # Append the sub-rule id, when there is one, to the rule id.
            rule_id = match.ruleId
            if match.subId is not None:
                rule_id += '[{}]'.format(match.subId)

            message = match.msg + ' (' + rule_id + ')'
            source_range = SourceRange.from_values(filename,
                                                   match.fromy+1,
                                                   match.fromx+1,
                                                   match.toy+1,
                                                   match.tox+1)
            yield Result(self, message, diffs=diffs,
                         affected_code=(source_range,))
Пример #10
0
def getFeedLanguage(feed, debug=False):
    """Return the language guessed most often for a parsed feed.

    The feed title counts twice. For each of the first ten entries, the
    title, the summary and the joined content values (whichever exist) are
    reduced to plain text and each counts once. Votes for 'UNKNOWN' are
    discarded before the winner is chosen.

    The print calls and ``items()`` are written so that the function runs
    unchanged on both Python 2 and Python 3.

    :param feed:  Parsed feed exposing ``feed.title`` and ``entries``.
    :param debug: When true, print the extracted texts and the vote table.
    :return:      The language key with the highest vote count.
    """
    print("Feed: %s" % feed.feed.title)

    langCount = defaultdict(int)

    def plain_text(markup):
        # Join all text nodes of the markup into one string.
        return ' '.join(getSoupParser(markup).findAll(text=True))

    def vote(text, weight=1):
        langCount[guess_language(text)] += weight

    vote(feed.feed.title, weight=2)
    for entry in feed.entries[:10]:

        if hasattr(entry, 'title'):
            txtValue = plain_text(entry.title)
            if debug:
                print("entry-title: %s" % txtValue)
            vote(txtValue)

        if hasattr(entry, 'summary'):
            txtValue = plain_text(entry.summary)
            if debug:
                print("entry-summary: %s" % txtValue)
            vote(txtValue)

        if hasattr(entry, 'content'):
            allContents = ' '.join([content.value for content in entry.content])
            print(allContents)
            # Parse the joined contents themselves, not the text left over
            # from the title/summary branches.
            txtValue = plain_text(allContents)
            if debug:
                print("entry-allcontents: %s" % txtValue)
            vote(txtValue)

    # Undetected texts must never win the vote.
    langCount['UNKNOWN'] = 0

    if debug:
        print("langCount: %s" % (langCount,))
    key, value = max(langCount.items(), key=lambda x: x[1])
    return key
Пример #11
0
        def test_guess_enchant(self):
            """Check guesses that depend on an enchant spelling dictionary.

            A case from ``self.enchant_tests`` is only asserted when one of
            the languages listed by enchant starts with the expected name;
            otherwise a warning is emitted and the case is skipped.
            """
            available = enchant.list_languages()

            for sample, expected in self.enchant_tests:
                has_dictionary = any(
                    language.startswith(expected) for language in available)
                if not has_dictionary:
                    warnings.warn("no spelling dictionary for language {!r}"
                                  .format(expected))
                    continue
                self.assertEqual(guess_language(sample), expected)
Пример #12
0
def lang_percentage(statusi, language="mk"):
    """Return the fraction of statuses written in the given language.

    A status counts when the language guessed for its text equals
    ``language``. The result is a float between 0 and 1, or 0 when
    ``statusi`` is empty.
    """
    matching = sum(
        1 for status in statusi
        if guess_language(unicode(status.text)) == language
    )
    total = len(statusi)
    if total == 0:
        return 0
    return matching / float(total)
Пример #13
0
	def on_success(self, status_data):
		"""Queue an incoming status if it has text and passes the filter.

		When ``self.lang_filter`` is set, the status is only kept if the
		language guessed for its text equals the filter or is UNKNOWN.
		Kept statuses are put on ``self.queue`` without blocking.
		"""
		# Messages without any text are ignored.
		if 'text' not in status_data:
			return

		if self.lang_filter:
			inferred_lang = guess_language(status_data['text'])
			if inferred_lang != self.lang_filter and inferred_lang != UNKNOWN:
				return

		self.queue.put(status_data, False)
 def filterEnglishTweets(self, filePathToFilter):
     """Rewrite the file in place, keeping only lines guessed as English.

     All lines are read first; each line is decoded as UTF-8 for the
     language guess, and the lines guessed as "en" are written back to the
     same path in their original order.
     """
     with open(filePathToFilter, 'r') as source:
         allLines = source.readlines()

     englishLines = [
         line for line in allLines
         if guess_language(line.decode("utf-8")) == u"en"
     ]

     with open(filePathToFilter, 'w') as target:
         target.writelines(englishLines)
Пример #15
0
def strip_stopwords(words):
    """Remove the stopwords of the words' guessed language.

    The language is guessed from the words joined by spaces. ``words`` is
    returned unchanged when the language is 'UNKNOWN' or no stopwords are
    loaded for it; otherwise the result of ``filter`` over ``words`` is
    returned.
    """
    language_code = guess_language.guess_language(' '.join(words))
    if language_code != 'UNKNOWN':
        stopwords = munin.stopwords.load_stopwords(language_code)
        if stopwords:
            return filter(lambda word: word not in stopwords, words)
    return words
Пример #16
0
	def on_success(self, status_data):
		"""Log an incoming status if it has text and passes the filter.

		When ``self.lang_filter`` is set, the status is only logged if the
		language guessed for its text equals the filter or is UNKNOWN.
		Logged statuses are formatted as "@<screen name>: <text>" and sent
		to ``self.logger`` at debug level.
		"""
		# Messages without any text are ignored.
		if 'text' not in status_data:
			return

		if self.lang_filter:
			inferred_lang = guess_language(status_data['text'])
			if inferred_lang != self.lang_filter and inferred_lang != UNKNOWN:
				return

		tweet = "@{author}: {text}".format(author=status_data['user']['screen_name'], text=status_data['text'])
		self.logger.debug(tweet)
Пример #17
0
def reply_post():
    """Save a reply to an existing post and redirect to the index page.

    The reply's language is guessed from its body; 'UNKNOWN' or a result
    longer than five characters is stored as an empty string. The parent
    is the first post whose path equals the submitted parent id.
    """
    form = PostForm(request.form)
    body = form.body.data

    detected = guess_language(body)
    language = '' if detected == 'UNKNOWN' or len(detected) > 5 else detected

    parent = Post.query.filter_by(path=form.parent_id.data).first()
    reply = Post(body=body,
                 parent=parent,
                 author=current_user,
                 language=language)
    reply.save()
    flash('Your post is now live!')
    # TODO: Figure out how to add the post without redirecting; if a
    # redirect is necessary, go back to the last known visited page.
    return redirect(url_for('main.index'))
Пример #18
0
def normalize_namen(groups):
    """Split each group's titles into de/fr/it and clean their whitespace.

    Every group is a 7-tuple whose first element is the list of titles:
    the first title is taken as German, the second (if any) as French and
    the third (if any) as Italian. A warning is printed when the French or
    Italian title is guessed to be in another language. The remaining six
    elements are passed through unchanged.
    """
    def guess(title):
        # Restrict the guess to the three languages a title can be in.
        return guess_language(title, ['de', 'fr', 'it'])

    normalized = []
    for (titles, members, sekretariat, konstituierung, zweck,
         art_der_aktivitaeten, mitgliederliste) in groups:
        title_de = titles[0]
        title_fr = None
        title_it = None
        if len(titles) > 1:
            title_fr = titles[1]
        if len(titles) > 2:
            title_it = titles[2]

        if title_fr and guess(title_fr) not in ['fr', 'UNKNOWN']:
            print("Warning: title_fr '{}' guess lanuage is guessed '{}'\n".
                  format(title_fr, guess(title_fr)))
        if title_it and guess(title_it) not in ['it', 'UNKNOWN']:
            print("Warning: title_it '{}' guess lanuage is guessed '{}'\n".
                  format(title_it, guess(title_it)))

        cleaned = (clean_whitespace(title_de), clean_whitespace(title_fr),
                   clean_whitespace(title_it))
        normalized.append(
            cleaned + (members, sekretariat, konstituierung, zweck,
                       art_der_aktivitaeten, mitgliederliste))
    return normalized
Пример #19
0
def get_lang_guess(text: str) -> str:
    """Return the language guessed for *text*, or "" in some cases.

    An empty string is returned when the guess is falsy, or when it is not
    "UNKNOWN" and is listed in ``glovar.lang_protect``. Any exception is
    logged and whatever result was obtained so far is returned.
    """
    result = ""

    try:
        result = guess_language(text)
        if not result:
            return ""
        if result != "UNKNOWN" and result in glovar.lang_protect:
            return ""
    except Exception as e:
        logger.info(f"Get lang guess error: {e}", exc_info=True)

    return result
Пример #20
0
def index():
    """Render the home page and handle submission of a new post.

    On a valid submission the post is stored with its guessed language
    ('UNKNOWN' or a result longer than five characters is stored as an
    empty string), an uploaded file is attached to that post, and the
    user is redirected back to the index. Otherwise the requested page of
    followed posts is rendered.
    """
    form = PostForm()
    if form.validate_on_submit():
        language = guess_language(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        post = Post(body=form.post.data,
                    author=current_user,
                    language=language)
        db.session.add(post)
        db.session.commit()
        # NOTE: a Preferences row (science / sport / people / policy flags
        # taken from the form) used to be stored here; it is disabled.
        if form.file.data:
            # Attach the upload to the post created above. Re-querying for
            # the newest Post could return a post committed by another
            # request in the meantime.
            print(post)
            print("Name saved file: ", form.file.data.name)

            newFile = FileContent(name=form.file.data.filename,
                                  data=form.file.data.read(),
                                  postId=post)

            db.session.add(newFile)
        db.session.commit()

        flash(_('Your post is now live!'))
        return redirect(url_for('main.index'))
    page = request.args.get('page', 1, type=int)
    posts = current_user.followed_posts().paginate(
        page, current_app.config['POSTS_PER_PAGE'], False)
    next_url = url_for('main.index', page=posts.next_num) \
        if posts.has_next else None
    prev_url = url_for('main.index', page=posts.prev_num) \
        if posts.has_prev else None

    posts = formatLaTeX(posts.items)
    leng = User.query.filter_by(username=current_user.username).first()
    len_post = leng.len_post
    return render_template('index.html',
                           title=_('Home'),
                           form=form,
                           posts=posts,
                           next_url=next_url,
                           prev_url=prev_url,
                           len_post=int(len_post))
    def run(self,
            filename,
            file,
            natural_language: str='auto',
            languagetool_disable_rules: typed_list(str)=()):
        '''
        Checks the code with LanguageTool.

        :param natural_language:           A locale representing the language
                                           you want to have checked. If set to
                                           'auto' the language is guessed.
                                           If the language cannot be guessed,
                                           'en-US' is used.
        :param languagetool_disable_rules: List of rules to disable checks for.
        '''
        # Imported lazily so that check_prerequisites can run even when
        # language_check is not installed.
        from language_check import LanguageTool, correct

        text = ''.join(file)
        if natural_language == 'auto':
            natural_language = guess_language(text)
        # Fall back to US English when no language is available.
        natural_language = natural_language or 'en-US'

        checker = LanguageTool(natural_language, motherTongue='en_US')
        checker.disabled.update(languagetool_disable_rules)

        for match in checker.check(text):
            # A diff can only be offered when the match has replacements.
            diffs = None
            if match.replacements:
                corrected = correct(text, [match]).splitlines(True)
                diffs = {filename: Diff.from_string_arrays(file, corrected)}

            # Append the sub-rule id, when there is one, to the rule id.
            rule_id = match.ruleId
            if match.subId is not None:
                rule_id = rule_id + '[{}]'.format(match.subId)

            yield Result(
                self,
                match.msg + ' (' + rule_id + ')',
                diffs=diffs,
                affected_code=(SourceRange.from_values(filename,
                                                       match.fromy+1,
                                                       match.fromx+1,
                                                       match.toy+1,
                                                       match.tox+1),))
Пример #22
0
def main():
    """Split tweet texts into Nepali and English and save them as CSV.

    The ``text`` column of the input CSV is read, a fixed set of symbols
    is removed from every text, and the text is filed as Nepali when its
    guessed language is "ne" or when guessing fails, and as English in
    every other case. The two lists are zipped into a two-column frame
    written to ``tweets_filter2.csv``.
    """
    # empty lists to store the nepalese and english texts
    nepalese = []
    english = []
    # guess_language detects these symbols as non-english
    restrictions = ['”','…', '“', '’', 'ç', '‘', '..']

    # Raw string: the backslashes are path separators. Without the r
    # prefix, "\U" starts a unicode escape and is a syntax error on
    # Python 3.
    location = r'd:\Users\user\Desktop\weets.csv'
    df = pd.read_csv(location)  # Dataframe containing the original tweets
    for user in df.text:
        for rest_syb in restrictions:
            user = user.replace(rest_syb, '')
        try:
            # Guess once per text instead of once per comparison.
            language = guess_language(user)
        except Exception:
            # Texts the detector cannot handle are filed as Nepali.
            nepalese.append(user)
            continue

        if language == "ne":
            nepalese.append(user)
        else:
            # "en", "UNKNOWN" and every other language count as English.
            english.append(user)

    # Creating a dataset
    # NOTE(review): zip() stops at the shorter list, so surplus texts of
    # the longer list are not written - confirm this is intended.
    NameDataSet = list(zip(nepalese, english))

    name_types = pd.DataFrame(data = NameDataSet, columns = ['Nepali Text','English Text'])
    name_types.to_csv('tweets_filter2.csv',index= False, header= True)
Пример #23
0
def get_headline(topic, otit, topic_1_name):
    '''
    Gets the url of the topic.
    Parses the html for all the titles.
    Checks each title:
        wasn't used recently to generate a title,
        contains the topic,
        doesn't end in ellipsis (...),
        is in spanish.
        doesn't already contain the text that will be used as replacement
    If at least one title is valid it selects one randomly and returns it;
    otherwise it returns False.
    '''
    valid_headlines = []
    sleep(1)
    tree = html.parse(urlopen(topic['url']))
    headlines = tree.xpath("//span[@class='titletext']")
    # Compile once for all headlines. The topic name is escaped so that
    # regex metacharacters in it are matched literally.
    regex = re.compile(r'\b{0}\b'.format(re.escape(topic['name'])),
                       re.IGNORECASE)
    # Walk from the last headline to the first, the order pop() produced.
    for headline in reversed(headlines):
        text = headline.text_content()
        if text in otit:
            log.info('Headline recently used to generate title: %s', text)
            continue
        if regex.search(text) is None:
            log.debug('Invalid headline, regex failed on: %s', text)
            continue
        if text[-3:] == '...':
            log.debug('Invalid headline, ellipsis on: %s', text)
            continue
        if guess_language(text) != 'es':
            log.debug('Invalid headline, not spanish on: %s', text)
            continue
        if topic_1_name in text:
            log.debug('Invalid headline %s, contains replacement: %s',
                      text, topic_1_name)
            continue
        valid_headlines.append(text)
        log.debug('Valid headline: %s', text)

    if len(valid_headlines) > 0:
        log.info('%s valid headlines found for topic %s',
                 len(valid_headlines), topic['name'])
        return (random.choice(valid_headlines))
    else:
        log.info('No valid headlines found for topic %s', topic['name'])
        return False
Пример #24
0
def get_headline(topic, otit, topic_1_name):
    '''
    Gets the url of the topic.
    Parses the html for all the titles.
    Checks each title:
        wasn't used recently to generate a title,
        contains the topic,
        doesn't end in ellipsis (...),
        is in spanish.
        doesn't already contain the text that will be used as replacement
    If at least one title is valid it selects one randomly and returns it;
    otherwise it returns False.
    '''
    valid_headlines = []
    sleep(1)
    tree = html.parse(urlopen(topic['url']))
    headlines = tree.xpath("//span[@class='titletext']")
    # The pattern does not change between headlines, so build it once.
    # re.escape keeps metacharacters in the topic name from being read as
    # regex syntax.
    regex = re.compile(r'\b{0}\b'.format(re.escape(topic['name'])),
                       re.IGNORECASE)
    # Same last-to-first order as popping from the end of the list.
    for headline in reversed(headlines):
        text = headline.text_content()
        if text in otit:
            log.info('Headline recently used to generate title: %s', text)
            continue
        if regex.search(text) is None:
            log.debug('Invalid headline, regex failed on: %s', text)
            continue
        if text[-3:] == '...':
            log.debug('Invalid headline, ellipsis on: %s', text)
            continue
        if guess_language(text) != 'es':
            log.debug('Invalid headline, not spanish on: %s', text)
            continue
        if topic_1_name in text:
            log.debug('Invalid headline %s, contains replacement: %s',
                      text, topic_1_name)
            continue
        valid_headlines.append(text)
        log.debug('Valid headline: %s', text)

    if len(valid_headlines) > 0:
        log.info('%s valid headlines found for topic %s', len(valid_headlines),
                 topic['name'])
        return (random.choice(valid_headlines))
    else:
        log.info('No valid headlines found for topic %s', topic['name'])
        return False
Пример #25
0
def nlp():
    """Show the NLP form and store a submitted post.

    The current user is looked up by username (404 if missing). On a valid
    submission the post is saved with its guessed language ('UNKNOWN' or a
    result longer than five characters becomes an empty string) and the
    user is redirected back to this view.
    """
    user = User.query.filter_by(username=current_user.username).first_or_404()
    form = PostForm()
    if not form.validate_on_submit():
        return render_template("nlp.html", form=form, user=user)

    detected = guess_language(form.post.data)
    language = '' if detected == 'UNKNOWN' or len(detected) > 5 else detected
    db.session.add(Post(body=form.post.data,
                        author=current_user,
                        language=language))
    db.session.commit()
    flash(_('Форма заполнена'))
    return redirect(url_for('main.nlp'))
Пример #26
0
def send_message(recipient):
    """Show the message form and send a private message to *recipient*.

    The recipient is looked up by username (404 if missing). On a valid
    submission the message is stored with its guessed language ('UNKNOWN'
    or a result longer than five characters becomes an empty string), the
    recipient's unread-message notification is updated, and the sender is
    redirected to the recipient's profile.
    """
    user = User.query.filter_by(username=recipient).first_or_404()
    form = MessageForm()
    if not form.validate_on_submit():
        return render_template('send_message.html', title='Send Message',
                               form=form, user=user)

    detected = guess_language(form.message.data)
    language = '' if detected == 'UNKNOWN' or len(detected) > 5 else detected
    msg = Message(author=current_user, recipient=user,
                  body=form.message.data, language=language)
    db.session.add(msg)
    user.add_notification('unread_message_count', user.new_messages())
    db.session.commit()
    flash('Your message has been sent.')
    # Redirect only after the form has been submitted successfully.
    return redirect(url_for('main.user', username=recipient))
Пример #27
0
def notes():
    """Show the notes page and store a submitted post.

    On a valid submission the post is saved with its guessed language
    ('UNKNOWN' or a result longer than five characters becomes an empty
    string) and the user is redirected back to this view. Otherwise all
    followed posts are rendered.
    """
    form = PostForm()
    if not form.validate_on_submit():
        followed = current_user.followed_posts().all()
        return render_template("main/notes.html", notes=followed, form=form)

    detected = guess_language(form.post.data)
    language = '' if detected == 'UNKNOWN' or len(detected) > 5 else detected
    db.session.add(Post(body=form.post.data,
                        author=current_user,
                        language=language))
    db.session.commit()
    flash(_("Ваш пост опубликован!"))
    return redirect(url_for("main.notes"))
Пример #28
0
def create_subreddit():
    """Show the subreddit form and create a subreddit on submission.

    The language is guessed from the description; 'UNKNOWN' or a result
    longer than five characters is stored as an empty string. After saving,
    the user is redirected to the page remembered in the session under
    'prior_thread_create_page'.
    """
    form = CreateSubredditForm()
    if not form.validate_on_submit():
        return render_template('create_subreddit.html',
                               form=form,
                               page_title=_('Reddit - Create Subreddit'))

    detected = guess_language(form.description.data)
    if detected == 'UNKNOWN' or len(detected) > 5:
        detected = ''
    db.session.add(Subreddit(name=form.name.data,
                             description=form.description.data,
                             language=detected))
    db.session.commit()
    return redirect(session['prior_thread_create_page'])
Пример #29
0
    def POST(self, text):
        """
        Guess the language of the POSTed text.

        Args:
            text (str): text whose language should be guessed.
        Returns:
            A dict holding the guess under the 'language' key, e.g.

            {'language': 'en'}

        """
        return {'language': guess_language(text)}
Пример #30
0
def index():
    """Render the home page and handle submission of a new post.

    On a valid submission the post is saved with its guessed language
    ('UNKNOWN' or a result longer than five characters becomes an empty
    string) and the user is redirected to the index. Otherwise the
    requested page of followed posts is rendered with next/previous links.
    """
    form = PostForm()
    if form.validate_on_submit():
        detected = guess_language(form.post.data)
        language = '' if detected == 'UNKNOWN' or len(detected) > 5 else detected
        db.session.add(Post(body=form.post.data,
                            author=current_user,
                            language=language))
        db.session.commit()
        flash(_("Your post has been created"))
        return redirect(url_for('main.index'))

    page = request.args.get('page', 1, type=int)
    posts = current_user.followed_posts().paginate(
        page, current_app.config['POSTS_PER_PAGE'], False)

    # Only link to neighbouring pages that exist.
    next_url = None
    if posts.has_next:
        next_url = url_for('main.index', page=posts.next_num)
    prev_url = None
    if posts.has_prev:
        prev_url = url_for('main.index', page=posts.prev_num)

    return render_template("index.html",
                           title=_("home"),
                           posts=posts.items,
                           form=form,
                           next_url=next_url,
                           prev_url=prev_url)
Пример #31
0
def index():
    """Render the home page and handle submission of a new post.

    On a valid submission the post is saved with its guessed language
    ('UNKNOWN' or a result longer than ten characters becomes an empty
    string) and the user is redirected to the index. Otherwise the
    requested page of followed posts is rendered with next/previous links.
    """
    form = PostForm()
    if form.validate_on_submit():
        detected = guess_language(form.post.data)
        language = "" if detected == 'UNKNOWN' or len(detected) > 10 else detected
        db.session.add(Post(body=form.post.data,
                            author=current_user,
                            language=language))
        db.session.commit()
        flash(_('Your post is now live!'))
        return redirect(url_for('index'))

    page = request.args.get('page', 1, type=int)
    posts = current_user.followed_posts().paginate(
        page, app.config['POST_PER_PAGE'], False)

    # Only link to neighbouring pages that exist.
    next_url = None
    if posts.has_next:
        next_url = url_for('index', page=posts.next_num)
    prev_url = None
    if posts.has_prev:
        prev_url = url_for('index', page=posts.prev_num)

    return render_template('index.html',
                           title=_('Home'),
                           form=form,
                           posts=posts.items,
                           next_url=next_url,
                           prev_url=prev_url)
Пример #32
0
def index():
    """Render the home page; on a valid form submission create a post first.

    The new post stores the guessed language of its body ('' when the guess
    is unknown or longer than five characters). After saving, the client is
    redirected to ``main.index``.
    """
    pform = PostForm()

    if pform.validate_on_submit():
        body = pform.post.data
        language = guess_language(body)
        unusable = language == "UNKNOWN" or len(language) > 5
        if unusable:
            language = ""
        db_post = Post(body=body, author=current_user, language=language)
        db.session.add(db_post)
        db.session.commit()
        flash(_("Your post is now live!"))
        return redirect(url_for("main.index"))

    page = request.args.get("page", 1, type=int)
    followed = current_user.followed_posts()
    posts = followed.paginate(page, current_app.config["POSTS_PER_PAGE"], False)

    # Neighbouring page links are None when there is no such page.
    next_url = None
    if posts.has_next:
        next_url = url_for("main.index", page=posts.next_num)
    prev_url = None
    if posts.has_prev:
        prev_url = url_for("main.index", page=posts.prev_num)

    return render_template(
        "index.html",
        title=_("Home"),
        form=pform,
        posts=posts.items,
        next_url=next_url,
        prev_url=prev_url,
    )
Пример #33
0
def new_ad():
    """Show the ad form and create an ``Ad`` from a valid submission.

    The ad's language is guessed from its description; an unknown guess or
    one longer than five characters is stored as ''. After saving, the
    client is redirected to ``main.index``.
    """
    form = AdForm()
    if not form.validate_on_submit():
        return render_template('new_ad.html', title=_('New ad'), form=form)

    guessed = guess_language(form.description.data)
    language = guessed if guessed != 'UNKNOWN' and len(guessed) <= 5 else ''
    ad = Ad(
        title=form.title.data,
        category=form.category.data,
        description=form.description.data,
        language=language,
        author=current_user,
    )
    db.session.add(ad)
    db.session.commit()
    flash(_('New ad posted!'))
    return redirect(url_for('main.index'))
Пример #34
0
def index():
    """Home page: accept a new post, or list one page of followed posts.

    A valid submission is answered with a redirect (Post/Redirect/Get), so
    that refreshing the page afterwards issues a GET instead of re-sending
    the POST and inserting a duplicate post.
    """
    form = PostForm()  # form used to write a new post

    if form.validate_on_submit():
        body = form.post.data
        # Store the language of the new post. If detection comes back as
        # unknown or unexpectedly long, play it safe and save an empty
        # string; by convention a post with an empty language is treated as
        # being in an unknown language.
        language = guess_language(body)
        if not (language != 'UNKNOWN' and len(language) <= 5):
            language = ''

        new_post = Post(body=body, author=current_user, language=language)
        db.session.add(new_post)
        db.session.commit()
        flash('Your post is now live!')
        return redirect(url_for('main.index'))

    # Earlier revisions used a hard-coded list of sample posts, and later
    # current_user.followed_posts().all(); the feed is now paginated.
    # 1. Take the page number from the ``page`` query argument (default 1).
    page = request.args.get('page', 1, type=int)
    # 2. Fetch only that page; the page size comes from POSTS_PER_PAGE in the
    #    application config.
    followed = current_user.followed_posts()
    posts = followed.paginate(page, current_app.config['POSTS_PER_PAGE'], False)

    # Links to the next/previous page are set only when a page exists in
    # that direction; otherwise they stay None.
    next_url = None
    if posts.has_next:
        next_url = url_for('main.index', page=posts.next_num)
    prev_url = None
    if posts.has_prev:
        prev_url = url_for('main.index', page=posts.prev_num)

    return render_template(
        'index.html',
        title='Home Page',
        form=form,
        posts=posts.items,
        next_url=next_url,
        prev_url=prev_url,
    )
Пример #35
0
def guess_external_subtitles(dest_folder, subtitles):
    """Guess the language of external subtitles that have none assigned.

    For every entry of ``subtitles`` whose language is falsy, the file is
    read from ``dest_folder``, its encoding is detected with ``chardet`` and
    the decoded text is passed to ``guess_language``. A successful guess is
    converted to a ``Language`` object and stored back into the mapping.

    Args:
        dest_folder: directory that contains the subtitle files.
        subtitles: mapping of subtitle file name to language; updated in
            place.

    Returns:
        The same ``subtitles`` mapping that was passed in.
    """
    for subtitle, language in subtitles.items():
        if language:
            continue

        subtitle_path = os.path.join(dest_folder, subtitle)
        if not (os.path.exists(subtitle_path) and os.path.splitext(
                subtitle_path)[1] in core.SUBTITLE_EXTENSIONS):
            continue

        logging.debug(
            "BAZARR falling back to file content analysis to detect language."
        )
        detected_language = None

        # to improve performance, skip detection of files larger than 1M
        if os.path.getsize(subtitle_path) > 1 * 1024 * 1024:
            logging.debug(
                "BAZARR subtitles file is too large to be text based. Skipping this file: "
                + subtitle_path)
            continue

        with open(subtitle_path, 'rb') as f:
            text = f.read()

        try:
            guess = chardet.detect(text)
            logging.debug('BAZARR detected encoding %r', guess)
            # decode() raises TypeError when no encoding could be detected.
            text = text.decode(guess["encoding"])
            detected_language = guess_language(text)
        except (UnicodeDecodeError, TypeError):
            logging.exception(
                "BAZARR subtitles file doesn't seems to be text based. Skipping this file: "
                + subtitle_path)
        except Exception:
            # Best effort: any other failure is logged and the file skipped.
            # Unlike a bare ``except:``, this lets KeyboardInterrupt and
            # SystemExit propagate.
            logging.exception(
                'BAZARR Error trying to detect language for this subtitles file: '
                + subtitle_path +
                ' You should try to delete this subtitles file manually and ask '
                'Bazarr to download it again.')
        else:
            if detected_language:
                logging.debug(
                    "BAZARR external subtitles detected and guessed this language: "
                    + str(detected_language))
                try:
                    subtitles[subtitle] = Language.rebuild(
                        Language.fromietf(detected_language))
                except Exception:
                    # Leave the entry untouched, but record why instead of
                    # silently swallowing the failure.
                    logging.debug(
                        'BAZARR unable to convert guessed language %r for '
                        'subtitles file: %s',
                        detected_language, subtitle_path, exc_info=True)
    return subtitles
Пример #36
0
def post_comment():
    """Store a comment sent in the request form and return JSON metadata.

    Reads ``id`` (the post id) and ``comment`` (the body) from
    ``request.form``, saves a ``Comment`` with the guessed language of the
    body, and returns the post's comment count plus the current user's
    avatar URL and username as JSON.
    """
    # The form object is fetched but not used further in this view.
    _comment_form = g.comment_form
    post_id = request.form['id']
    body = request.form['comment']
    # NOTE(review): .first() yields None when no post matches; the
    # post.comments access below would then fail after the commit - confirm
    # that callers always send a valid id.
    post = Post.query.filter_by(id=post_id).first()

    guessed = guess_language(body)
    language = guessed if guessed != 'UNKNOWN' and len(guessed) <= 5 else ''

    comment = Comment(
        body=body,
        language=language,
        post_id=int(post_id),
        user_id=current_user.id,
    )
    db.session.add(comment)
    db.session.commit()

    payload = {
        'comment_num': post.comments.count(),
        'avatarURL': current_user.avatar(70),
        'comment_username': current_user.username,
    }
    return jsonify(payload)
Пример #37
0
def index(page=1):
    """Home view: create a post from a valid submission, else list posts.

    Args:
        page: page number of the followed-posts listing to show.

    On a valid submission the post is stored with the guessed language of
    its body and the client is redirected to ``auth.index``. Otherwise the
    requested page of the user's followed posts is rendered.
    """
    form = PostForm()
    if form.validate_on_submit():
        now_time = datetime.utcnow()
        language = guess_language(form.post.data)
        # Fixed sentinel spelling: the original compared against 'UNKNOW',
        # which only worked because 'UNKNOWN' also fails the length check.
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        post = Post(body=form.post.data, timestamp=now_time, author=g.user,
                    language=language)
        db.session.add(post)
        db.session.commit()
        flash('Your post is now live')
        return redirect(url_for('auth.index'))
    user = g.user
    posts = user.followed_posts().paginate(page,
                                           current_app.config['POSTS_AVG_PAGE'], False)
    return render_template('index.html', title='Home', user=user, posts=posts, form=form)
Пример #38
0
def new_post():
    """Show the new-post form and create a post from a valid submission.

    Aborts with 403 unless the current user has the ``NEW_POST`` permission.
    After saving, the client is redirected to ``main.show_post`` for the
    newly created post.
    """
    allowed = current_user.can('NEW_POST')
    if not allowed:
        abort(403)

    form = PostForm()
    if not form.validate_on_submit():
        return render_template('new_post.html', form=form)

    language = guess_language(form.post.data)
    if not (language != 'UNKNOWN' and len(language) <= 5):
        # Unknown or implausibly long guesses are stored as ''.
        language = ''
    category = Category.query.get(form.category.data)
    post = Post(
        title=form.title.data,
        body=form.post.data,
        author=current_user,
        language=language,
        category=category,
    )
    db.session.add(post)
    db.session.commit()
    flash(_('Your post is now live!'))
    return redirect(url_for('main.show_post', post_id=post.id))
Пример #39
0
def articleeditorWTF():
    """Render the CKEditor article form and store a valid submission.

    The article's language is guessed from its content; an unknown guess or
    one longer than five characters is stored as ''.
    """
    ckarticle = CKarticle()
    if not ckarticle.validate_on_submit():
        return render_template('/ckeditor/articleEditorWTF.html', form=ckarticle)

    content = ckarticle.content.data
    guessed = guess_language(content)
    language = guessed if guessed != 'UNKNOWN' and len(guessed) <= 5 else ''
    article = Article(
        body=content,
        author=current_user,
        language=language,
        title=ckarticle.title.data,
    )
    db.session.add(article)
    db.session.commit()
    flash(_('Your article is now live!'))
    # NOTE(review): the endpoint is spelled 'articleditorWTF' (one 'e') while
    # this function is 'articleeditorWTF' - confirm the endpoint exists.
    return redirect(url_for('main.articleditorWTF'))
Пример #40
0
def register():
    """Register a new user from the registration form.

    Authenticated users are redirected to ``index``. On a valid submission
    the user is created and the client is redirected to ``login``.
    """
    if current_user.is_authenticated:
        return redirect(url_for('index'))

    form = RegistrationForm()
    if not form.validate_on_submit():
        return render_template('register.html', title='Register', form=form)

    # The language is guessed from the username text itself.
    guessed = guess_language(form.username.data)
    language = guessed if guessed != "UNKNOWN" and len(guessed) <= 5 else ''
    new_user = User(
        username=form.username.data,
        email=form.email.data,
        language=language,
    )
    new_user.set_password(form.password.data)
    db.session.add(new_user)
    db.session.commit()
    flash('Congratulations, you are now a registered user!')
    return redirect(url_for('login'))
Пример #41
0
def index():
    """Home view: save a submitted post, otherwise list followed posts.

    The template receives the pagination object itself together with the
    endpoint name ``main.index``.
    """
    form = PostForm()
    if not form.validate_on_submit():
        page = request.args.get('page', 1, type=int)
        followed = current_user.followed_posts()
        pagination = followed.paginate(
            page, current_app.config['POSTS_PER_PAGE'], False)
        return render_template(
            'index.html',
            title=_('Página inicial'),
            posts=pagination.items,
            form=form,
            pagination=pagination,
            endpoint='main.index',
        )

    text = form.post.data
    language = guess_language(text)
    if not (language != 'UNKNOWN' and len(language) <= 5):
        language = ''
    new_post = Post(body=text, author=current_user, language=language)
    db.session.add(new_post)
    db.session.commit()
    flash(_('Postado com sucesso!'))
    return redirect(url_for('main.index'))
Пример #42
0
def extract_keywords(text, use_stemmer=True):
    """Guess the language of ``text`` and extract its rated keywords.

    :param text: The text to analyse.
    :param use_stemmer: Forwarded to ``extract_phrases`` together with the
        guessed language code.
    :returns: A ``(language_code, keywords)`` tuple. When no phrases could
        be extracted, ``(None, OrderedDict())`` is returned instead.
    :rtype: tuple
    """
    language_code = guess_language.guess_language(text)
    sentences = split_sentences(text)
    phrases = extract_phrases(sentences, language_code, use_stemmer)

    # No phrases: this can happen if no stopwords are available, or a
    # one-word input was used.
    if phrases is None:
        return None, OrderedDict()

    keywords = candidate_keywordscores(phrases, word_scores(phrases))
    return language_code, filter_subsets(keywords)
def get_txt(df, field, output_path):
    """Write the English values of one DataFrame column to a text file.

    Missing values in ``df`` are replaced by '' in place. Every non-empty
    value of ``field`` that ``guess_language`` reports as 'en' is UTF-8
    encoded and written on its own line to ``output_path``, with every
    occurrence of '&amp' removed. Progress is printed after each row.

    Args:
        df: pandas DataFrame to read from (modified in place by ``fillna``).
        field: name of the column holding the text.
        output_path: path of the output file (overwritten).
    """
    df.fillna('', inplace=True)
    pieces = []  # collected lines; joined once instead of quadratic '+='
    tot = df.shape[0]
    cnt = 0
    for _, row in df.iterrows():
        try:
            # Compare by value: ``is not ''`` tested object identity, which
            # is implementation-dependent for strings.
            if row[field] != '' and guess_language(row[field]) == 'en':
                # NOTE(review): concatenating encoded bytes with '\n' only
                # works on Python 2, which this snippet targets.
                pieces.append(row[field].encode('utf-8') + '\n')
        except Exception as e:
            # Best effort: report the problem and carry on with the next row.
            # Single-argument print(...) behaves the same on Python 2 and 3.
            print(type(e))
            print(e.args)
            print(e)
        cnt += 1
        print('%d/%d' % (cnt, tot))
    with open(output_path, 'w') as f:
        f.write(''.join(pieces).replace('&amp', ''))
Пример #44
0
def index():
    """Home view: store a submitted post, otherwise list all posts.

    On a valid submission the post is saved with the guessed language of its
    body ('' when unknown or longer than five characters), an 'info' flash
    message is queued and the client is redirected to ``main.index``.
    """
    form = PostForm()
    if form.validate_on_submit():
        language = guess_language(form.post.data)
        if language == 'UNKNOWN' or len(language) > 5:
            language = ''
        post = Post(body=form.post.data,
                    author=current_user,
                    language=language)
        db.session.add(post)
        db.session.commit()
        # ``category`` belongs to flash(), not to the translation call; as a
        # keyword of _() it never reached flash().
        flash(_('Your post is now live'), category='info')
        return redirect(url_for('main.index'))
    return render_template('index.html',
                           title='Home Page',
                           form=form,
                           posts=Post.query.all())
Пример #45
0
def guess_external_subtitles(dest_folder, subtitles):
    """Guess the language of external subtitles that have none assigned.

    For every entry of ``subtitles`` whose language is falsy, the file in
    ``dest_folder`` is read (binary files are skipped) and its text is passed
    to ``guess_language``. A successful guess is converted to a ``Language``
    object and stored back into the mapping.

    Args:
        dest_folder: directory that contains the subtitle files.
        subtitles: mapping of subtitle file name to language; updated in
            place.

    Returns:
        The same ``subtitles`` mapping that was passed in.
    """
    for subtitle, language in six.iteritems(subtitles):
        if language:
            continue

        subtitle_path = os.path.join(dest_folder, subtitle)
        if not (os.path.exists(subtitle_path) and os.path.splitext(
                subtitle_path)[1] in core.SUBTITLE_EXTENSIONS):
            continue

        logging.debug(
            "BAZARR falling back to file content analysis to detect language."
        )
        if is_binary(subtitle_path):
            logging.debug(
                "BAZARR subtitles file doesn't seems to be text based. Skipping this file: "
                + subtitle_path)
            continue
        detected_language = None

        # On Python 3 undecodable bytes are dropped while reading; on
        # Python 2 the raw bytes are read and decoded further below.
        if six.PY3:
            with open(subtitle_path, 'r', errors='ignore') as f:
                text = f.read()
        else:
            with open(subtitle_path, 'r') as f:
                text = f.read()

        try:
            encoding = UnicodeDammit(text)
            if six.PY2:
                text = text.decode(encoding.original_encoding)
            detected_language = guess_language(text)
        except Exception:
            # Best effort: log the failure and leave this entry untouched.
            logging.exception(
                'BAZARR Error trying to detect language for this subtitles file: '
                + subtitle_path +
                ' You should try to delete this subtitles file manually and ask '
                'Bazarr to download it again.')
        else:
            if detected_language:
                logging.debug(
                    "BAZARR external subtitles detected and guessed this language: "
                    + str(detected_language))
                try:
                    subtitles[subtitle] = Language.rebuild(
                        Language.fromietf(detected_language))
                except Exception:
                    # Keep the entry as it was, but record why instead of
                    # silently swallowing the failure. Unlike a bare
                    # ``except:``, KeyboardInterrupt/SystemExit propagate.
                    logging.debug(
                        'BAZARR unable to convert guessed language %r for '
                        'subtitles file: %s',
                        detected_language, subtitle_path, exc_info=True)
    return subtitles
Пример #46
0
def index(page=1):
    """Home view: save a submitted post, otherwise list followed posts.

    Args:
        page: page number of the followed-posts listing to show.

    An unknown language guess is stored as 'en'; a guess longer than five
    characters is stored as ''.
    """
    form = PostForm()
    if not form.validate_on_submit():
        posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
        return render_template('index.html', title='Home', form=form, posts=posts)

    detected = guess_language(form.post.data)
    if detected == 'UNKNOWN':
        language = 'en'
    else:
        language = '' if len(detected) > 5 else detected
    post = Post(
        body=form.post.data,
        timestamp=datetime.utcnow(),
        author=g.user,
        language=language,
    )
    db.session.add(post)
    db.session.commit()
    flash(gettext('Your post is now live!'))
    return redirect(url_for('index'))
Пример #47
0
def edit_profile():
    """Edit the current user's username, about-me text and language.

    On a valid submission the new values are committed and the client is
    redirected to ``main.edit_profile``. On a GET request the form is
    pre-filled with the current values before rendering.
    """
    form = EditProfileForm(current_user.username)

    if form.validate_on_submit():
        current_user.username = form.username.data
        current_user.about_me = form.about_me.data
        # The profile language is guessed from the about-me text.
        guessed = guess_language(form.about_me.data)
        if not (guessed != 'UNKNOWN' and len(guessed) <= 5):
            guessed = ''
        current_user.language = guessed
        db.session.commit()
        flash(_('Your changes have been saved.'))
        return redirect(url_for('main.edit_profile'))

    # The submit branch above always returns, so a plain ``if`` suffices.
    if request.method == 'GET':
        form.username.data = current_user.username
        form.about_me.data = current_user.about_me

    return render_template(
        'edit_profile.html',
        title=_('Edit Profile'),
        form=form,
    )
Пример #48
0
def index(page=1):
    """Home view: store a submitted post, else show a page of followed posts.

    Args:
        page: page number of the followed-posts listing to show.

    An unknown language guess falls back to 'en'; any other guess longer
    than five characters is replaced by ''.
    """
    form = PostForm()

    if form.validate_on_submit():
        language = guess_language(form.post.data)
        if language != 'UNKNOWN':
            if len(language) > 5:
                language = ''
        else:
            language = 'en'
        new_post = Post(body=form.post.data, timestamp=datetime.utcnow(),
                        author=g.user, language=language)
        db.session.add(new_post)
        db.session.commit()
        flash(gettext('Your post is now live!'))
        return redirect(url_for('index'))

    followed = g.user.followed_posts()
    posts = followed.paginate(page, POSTS_PER_PAGE, False)
    return render_template(
        'index.html',
        title='Home',
        form=form,
        posts=posts,
    )
Пример #49
0
def index():
    """Home view: save a submitted post, otherwise list all followed posts.

    After a valid submission the client is redirected to ``index``; any
    other request renders every post followed by the current user.
    """
    form = PostForm()
    if not form.validate_on_submit():
        posts = current_user.followed_posts().all()
        return render_template("index.html", title='Home Page', form=form,
                               posts=posts)

    text = form.post.data
    guessed = guess_language(text)
    # Unknown or implausibly long guesses are stored as ''.
    language = guessed if guessed != 'UNKNOWN' and len(guessed) <= 5 else ''
    post = Post(body=text, author=current_user, language=language)
    db.session.add(post)
    db.session.commit()
    flash('Your post is now live!')
    return redirect(url_for('index'))
Пример #50
0
    def run(self,
            filename,
            file,
            language: str='auto',
            languagetool_disable_rules: typed_list(str)=()):
        '''
        Checks the code with LanguageTool.

        :param language:                   A locale representing the language
                                           you want to have checked. If set to
                                           'auto' the language is guessed.
                                           If the language cannot be guessed,
                                           'en-US' is used.
        :param languagetool_disable_rules: List of rules to disable checks for.
        '''
        joined_text = "".join(file)

        # Resolve the language: guess when 'auto', fall back to 'en-US' when
        # the result is falsy.
        if language == 'auto':
            language = guess_language(joined_text)
        if not language:
            language = 'en-US'

        tool = LanguageTool(language, motherTongue="en_US")
        tool.disabled.update(languagetool_disable_rules)

        for match in tool.check(joined_text):
            # A diff is only offered when LanguageTool suggests replacements.
            diffs = None
            if match.replacements:
                replaced = correct(joined_text, [match]).splitlines(True)
                diffs = {filename: Diff.from_string_arrays(file, replaced)}

            rule_id = match.ruleId
            if match.subId is not None:
                rule_id += '[{}]'.format(match.subId)
            message = match.msg + ' (' + rule_id + ')'

            # Match positions are shifted by one for the source range.
            source_range = SourceRange.from_values(
                filename,
                match.fromy+1,
                match.fromx+1,
                match.toy+1,
                match.tox+1)
            yield Result(self, message, diffs=diffs,
                         affected_code=(source_range,))
Пример #51
0
def index():
    """Home view: create a post on submit, else show a page of the feed.

    A valid submission is stored with the guessed language of its body and
    answered with a redirect to ``main.index``. Otherwise one page of the
    current user's followed posts is rendered with next/previous links.
    """
    form = PostForm()
    if not form.validate_on_submit():
        page = request.args.get('page', 1, type=int)
        followed = current_user.followed_posts()
        posts = followed.paginate(
            page, current_app.config['POSTS_PER_PAGE'], False)
        # Links stay None when there is no page in that direction.
        next_url = None
        if posts.has_next:
            next_url = url_for('main.index', page=posts.next_num)
        prev_url = None
        if posts.has_prev:
            prev_url = url_for('main.index', page=posts.prev_num)
        return render_template('index.html', title=_('Home'), form=form,
                               posts=posts.items, next_url=next_url,
                               prev_url=prev_url)

    body = form.post.data
    language = guess_language(body)
    if not (language != 'UNKNOWN' and len(language) <= 5):
        language = ''
    new_post = Post(body=body, author=current_user, language=language)
    db.session.add(new_post)
    db.session.commit()
    flash(_('Your post is now live!'))
    return redirect(url_for('main.index'))
Пример #52
0
def index(page=1):
    """Home view: store a submitted post, else show a page of followed posts.

    Args:
        page: page number of the followed-posts listing to show.

    The template receives the pagination object returned by ``paginate``.
    """
    post_form = PostForm()

    if not post_form.validate_on_submit():
        # Either the page is just being opened or form validation failed;
        # in both cases show the index page with posts.
        posts = g.user.followed_posts().paginate(page, POSTS_PER_PAGE, False)
        return render_template(
            'index.html',
            title='Home',
            user=g.user,
            posts=posts,
            form=post_form,
        )

    # Form validation succeeded: build the post attribute by attribute and
    # store it in the database.
    post = Post()
    post.body = post_form.post.data
    post.timestamp = datetime.utcnow()
    post.user_id = g.user.id
    # Try to detect the post language automatically; unknown or implausibly
    # long guesses are stored as ''.
    guessed = guess_language(post.body)
    post.language = guessed if guessed != 'UNKNOWN' and len(guessed) <= 5 else ''
    db.session.add(post)
    db.session.commit()
    flash('Your post is now live!', 'info')
    return redirect(url_for('index'))
def find_language(string):
    """Return whatever ``guess_language`` reports for ``string``."""
    detected = guess_language(string)
    return detected
Пример #54
0
def main():
    """Check every file named on the command line with LanguageTool.

    Returns:
        The exit status: 0 when nothing was reported, 2 when at least one
        match was printed, and 1 when automatic language detection was
        requested but ``guess_language`` could not be imported or did not
        detect a language.
    """
    args = parse_args()

    status = 0

    for filename in args.files:
        # With several inputs, announce each file name on stderr.
        if len(args.files) > 1:
            print(filename, file=sys.stderr)

        if filename == "-":
            # "-" means standard input: pass its file descriptor on and pick
            # the terminal's encoding when interactive, otherwise the
            # locale's preferred encoding (unless --encoding was given).
            filename = sys.stdin.fileno()
            encoding = args.encoding or (sys.stdin.encoding if sys.stdin.isatty() else locale.getpreferredencoding())
        else:
            encoding = args.encoding or "utf-8"

        lang_tool = LanguageTool(motherTongue=args.mother_tongue)
        # The function-level import further down makes ``guess_language`` a
        # local name of main(); it starts out unset here.
        guess_language = None

        try:
            text = get_text(filename, encoding, ignore=args.ignore_lines)
        except UnicodeError as exception:
            # Undecodable input: report it and move on to the next file.
            print("{}: {}".format(filename, exception), file=sys.stderr)
            continue

        if args.language:
            if args.language.lower() == "auto":
                # guess_language is imported lazily; it is only needed for
                # automatic detection.
                try:
                    from guess_language import guess_language
                except ImportError:
                    print("guess_language is unavailable.", file=sys.stderr)
                    return 1
                else:
                    language = guess_language(text)
                    if not args.api:
                        print("Detected language: {}".format(language), file=sys.stderr)
                    # A falsy result means detection failed: abort the run.
                    if not language:
                        return 1
                    lang_tool.language = language
            else:
                lang_tool.language = args.language

        if not args.spell_check:
            lang_tool.disable_spellchecking()

        lang_tool.disabled.update(args.disable)
        lang_tool.enabled.update(args.enable)

        try:
            if args.api:
                # Print the decoded result of the API check as is.
                print_unicode(lang_tool._check_api(text).decode())
            elif args.apply:
                # Print the corrected text instead of listing matches.
                print_unicode(lang_tool.correct(text))
            else:
                for match in lang_tool.check(text):
                    rule_id = match.ruleId
                    if match.subId is not None:
                        rule_id += "[{}]".format(match.subId)

                    replacement_text = ", ".join("'{}'".format(word) for word in match.replacements).strip()

                    message = match.msg

                    # Messages that end with punctuation already include the
                    # suggestion.
                    if replacement_text and not message.endswith((".", "?")):
                        message += "; suggestions: " + replacement_text

                    # One line per match; positions are shifted by one.
                    print_unicode(
                        "{}:{}:{}: {}: {}".format(filename, match.fromy + 1, match.fromx + 1, rule_id, message)
                    )

                    # At least one match was reported.
                    status = 2
        except Error as exception:
            print("{}: {}".format(filename, exception), file=sys.stderr)
            continue

    return status
Пример #55
0
def parse_tweets():
    """Consume tweets from ``queue_`` and store English ones in SQLite.

    Opens the database at ``dbfile_``, requests WAL journal mode, and then,
    while ``running_`` is true, takes tweets from ``queue_``. Tweets without
    a place bounding box are skipped. Each remaining tweet is assigned to the
    first area in ``AREAS`` that intersects its bounding box (or "unknown").
    If it has words and its language is guessed as English, the tweet, its
    words, hashtags and expanded URLs are inserted and committed.
    """
    db = sqlite3.connect(dbfile_)
    cursor = db.cursor()
    cursor.execute("PRAGMA journal_mode = WAL")

    # The PRAGMA returns the journal mode now in effect; warn if it is not WAL.
    if cursor.fetchone()[0] != "wal":
        print "Could not set journal_mode!"

    while(running_):
        tweet = queue_.get()

        # place tweet into a city
        if not tweet['place'] or not tweet['place']['bounding_box']:
            continue

        poly = asShape(tweet['place']['bounding_box'])
        city = None
        # The first area that intersects the tweet's bounding box wins.
        for c,p in AREAS.items():
            if p.intersects(poly):
                city = c
                break

        if not city:
            city = "unknown"

        # parse the tweet for unique words and figure out the language
        tweet_text = tweet['text'].encode('utf-8')
        # Keep only the tokens that WORD_REGEX matches.
        cleaned = [m.group() for m in (WORD_REGEX.match(t) for t in tweet_text.split()) if m]
        lang = guess_language(" ".join(cleaned))
        # Lower-case each word, delete . ? ! , " characters and drop "'s".
        words = [w.lower().translate(None, '.?!,\"').replace("'s",'') for w in cleaned]

        if words and lang == 'en':
            # The same timestamp is used for every row written for this tweet.
            tstamp = int(time.time())

            # save to sqlite db and to a set of files because we don't completely
            # trust the sqlite db to not be corrupted at some point
            cursor.execute("INSERT INTO tweets VALUES (?, (SELECT city_id FROM cities WHERE name=?), ?)", \
                           (tstamp, city, " ".join(cleaned)))

            # Each value is added to ``vals`` if missing, then referenced by id.
            for w in words:
                cursor.execute("INSERT OR IGNORE INTO vals(val) VALUES (?)", (w,))
                cursor.execute("INSERT INTO word VALUES " \
                               "(?, (SELECT city_id FROM cities WHERE name=?), (SELECT val_id FROM vals WHERE val=?))",
                               (tstamp, city, w))

            if tweet['entities']['hashtags']:
                # Lower-cased hashtag texts that HASH_REGEX matches.
                tags = [m.group().lower() for m in (HASH_REGEX.match(h['text']) for h in tweet['entities']['hashtags']) if m]

                for t in tags:
                    cursor.execute("INSERT OR IGNORE INTO vals(val) VALUES (?)", (t,))
                    cursor.execute("INSERT INTO hash VALUES " \
                                   "(?, (SELECT city_id FROM cities WHERE name=?), (SELECT val_id FROM vals WHERE val=?))",
                                   (tstamp, city, t))

            if tweet['entities']['urls']:
                urls = [u['expanded_url'] for u in tweet['entities']['urls']]

                for u in urls:
                    cursor.execute("INSERT OR IGNORE INTO vals(val) VALUES (?)", (u,))
                    cursor.execute("INSERT INTO link VALUES " \
                                   "(?, (SELECT city_id FROM cities WHERE name=?), (SELECT val_id FROM vals WHERE val=?))",
                                   (tstamp, city, u))

            # Commit once per stored tweet.
            db.commit()

    # done collecting, clean up
    db.commit()
    db.close()
0
# Filters out tweets that are not written in English.

__author__ = 'Marie'

from guess_language import guess_language

# Input file; the filtered copy is written next to it with a "_filtered" suffix.
filename = 'C:\\Users\Marie\\Documents\\Project\\Data\\tweets2009-06.txt\\processed_tweets2009-06.txt'

f = open(filename, 'r')
out = open(filename + "_filtered", 'w')
i = 0  # number of input lines processed so far
for s in f:
    # Fields are separated by the literal ",.||"; the text checked for its
    # language is the fourth field.
    ssplitted = s.split(",.||")
    if (len(ssplitted) > 3 ):
        try:
            # If the tweet is guessed to be English, copy the whole line to
            # the output file.
            # NOTE(review): ``s`` is not stripped, so it presumably still ends
            # with its own newline and "\n" adds a blank line - confirm intent.
            if (guess_language(unicode(ssplitted[3])) == 'en'):
                out.write(s + "\n")
        except Exception:
            # Lines that cannot be converted or classified are skipped.
            pass
    i = i + 1
    # track progress
    if (i % 10000 == 0):
        print i
f.close()
out.close()
Пример #57
0
			# Retrieve the original utf-8 codification on the text and eliminate hashtags and cite
			# Inside the function calls the slang translation before eliminating marks
			utftext = put_readable(tweet['text'].decode('utf-8'), slg)
			
			# Count uppercases and marks 
			tweet['uppercases'] = n_upper_chars(utftext)
			tweet['marks'] = n_marks_chars(utftext)

# 			tweet['uppercases'] /= (float(len(utftext)) + 0.000001)
# 			tweet['marks'] /= (float(len(utftext)) + 0.000001)
# 			
			# Remove useless punctuation and put everything in lower case
			utftext = lower_punct(utftext)
			
			# Guess the language of the text and eliminate everything that is not English
			tw_lang = guess_language(utftext)
			if not (tw_lang == 'en'):
				archive_list.remove(tweet)
				
			else:
				#tweet['text_processed_unigrams'] = nltk.word_tokenize(utftext)
				tweet['text_processed_unigrams'] = utftext.split()
				tweet['text_processed_bigrams'] = nltk.bigrams(tweet['text_processed_unigrams'])
		
				utftext = " ".join(word for word in spell_correct(tweet['text_processed_unigrams'], Dict))
	
				# Process remaining features and save them into a dictionary
				tweet['rawvulgarity'] = process_vulgarity(tweet['text_processed_unigrams'], pwl)
				tweet['vulgarity'] = process_insults(tweet['text_processed_unigrams'], pwl)
				tweet['unpoliteness'] = process_politeness(tweet['text_processed_unigrams'], tweet['text_processed_bigrams'])
				tweet['disagreement'] = process_vs(utftext)
Пример #58
0
def is_english(text):
    """Return True when the UTF-8 encoded ``text`` is guessed to be English."""
    decoded = text.decode("utf-8")
    return guess_language.guess_language(decoded) == 'en'
Пример #59
0
 def analyze(self, movie, attr_name='plot'):
     """Store the guessed language of one movie attribute.

     The attribute named ``attr_name`` is read from ``movie.attributes``
     (a missing or falsy value is treated as ''), and the guess, converted
     to ``str``, is saved in ``movie.analyzer_data`` under ``self.name``.
     """
     text = movie.attributes.get(attr_name) or ''
     movie.analyzer_data[self.name] = str(guess_language(text))