def save(self, *args, **kwargs):
    """Save the record, then derive its name, slug and stored file name.

    The row is saved once up front so that ``self.pk`` is available for the
    slug, and once more at the end to persist the derived fields.

    Positional and keyword arguments are forwarded to the first parent
    ``save()`` call, so callers that pass ``force_insert``/``using``/
    ``update_fields`` keep working.

    Steps between the two saves:

    * If ``self.name`` is empty it is built from the model's verbose name
      and the related product's category name, name and code; if that
      fails, the class name is used instead.
    * ``self.slug`` is rebuilt from the ASCII-transliterated name and the
      primary key.
    * If ``self.file.url`` differs from ``'/media/' + self.file_url``, the
      file under ``settings.MEDIA_ROOT`` is copied to
      ``<path>/<slug>.<ext>``, the old file is removed, and ``self.file.name``,
      ``self.file_url`` and ``self.ext`` are updated.
    """
    print(self.file.name)
    # First save: guarantees a primary key before it is used in the slug.
    super(OneFile, self).save(*args, **kwargs)

    if not self.name:
        try:
            model_name = self.__class__._meta.verbose_name.title()
            self.name = ' '.join([
                model_name,
                self.product.category.name,
                self.product.name,
                self.product.code,
            ])
        except Exception:
            # Deliberate best-effort: a missing product/category or a
            # non-string part falls back to the class name.
            self.name = self.__class__.__name__

    self.slug = slugify(unidecode.unidecode(self.name) + '-' + str(self.pk))

    if self.file.url != '/media/' + self.file_url:
        # presumably imageFilename returns (directory, base name, extension)
        # -- only the directory and the extension are used here.
        path, _name, ext = imageFilename(self, self.file.name)
        self.ext = ext.lower()
        filename = f'{path}/{self.slug}.{ext}'
        old_name = self.file.name

        # Copy and delete only when the target really is a different file;
        # otherwise the freshly written file would be removed right away.
        if filename != old_name:
            with open(settings.MEDIA_ROOT + old_name, "rb") as src:
                raw = src.read()
            with open(settings.MEDIA_ROOT + filename, 'wb') as dst:
                dst.write(raw)
            try:
                os.remove(settings.MEDIA_ROOT + old_name)
            except OSError:
                # The old file being already gone (or locked) is not fatal.
                pass

        self.file.name = filename
        self.file_url = filename

    # Second save: persist the derived fields. Only ``using`` is carried
    # over -- repeating ``force_insert`` would fail on the existing row and
    # ``update_fields`` could leave the derived fields unsaved.
    super(OneFile, self).save(using=kwargs.get('using'))
def corpus_from_strings(strings, metadata=None, decode=False,
                        nltk_stop=True, stop_freq=0, add_stop=None,
                        tokenizer=word_tokenize):
    """
    Takes a list of strings and returns a Corpus object whose document
    tokens are the strings.

    :param strings: One string per document. The list is not modified.
    :type strings: list of strings

    :param metadata: One label per document. When omitted or empty, the
        labels ``document_0``, ``document_1``, ... are generated.
    :type metadata: sequence, optional

    :param decode: If `True`, text strings are transliterated to ASCII
        with `unidecode` before tokenization.
    :type decode: boolean, optional

    :param nltk_stop: Forwarded to `apply_stoplist` as `nltk_stop`.

    :param stop_freq: Forwarded to `apply_stoplist` as `freq`.

    :param add_stop: Forwarded to `apply_stoplist` as `add_stop`.

    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens

    :returns: The result of `apply_stoplist` applied to the new Corpus.
    """
    if decode:
        import unidecode

        try:
            text_type = unicode  # noqa: F821 -- Python 2 text type
        except NameError:
            text_type = str
        # Build a new list rather than rewriting the caller's list in place.
        strings = [unidecode.unidecode(s) if isinstance(s, text_type) else s
                   for s in strings]

    documents = [tokenizer(s) for s in strings]
    # Flatten in linear time; sum(documents, []) copies the list each step.
    corpus = [token for doc in documents for token in doc]
    # Cumulative token counts: the end offset of each document in `corpus`.
    indices = np.cumsum([len(doc) for doc in documents])
    del documents

    if metadata is None or len(metadata) == 0:
        metadata = ['document_{0}'.format(i) for i in range(len(strings))]

    md_type = np.array(metadata).dtype
    # Builtin int is what the removed alias `np.int` used to refer to.
    dtype = [('idx', int), ('document_label', md_type)]
    # zip() is lazy on Python 3, so materialize it before handing to NumPy.
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    c = Corpus(corpus, context_data=context_data, context_types=['document'])
    return apply_stoplist(c, nltk_stop=nltk_stop, freq=stop_freq,
                          add_stop=add_stop)
def preprocessing_mot(text):
    """Normalize text into a lower-case, ASCII, punctuation-free lemma string.

    The text is run through the module-level ``nlp`` callable (presumably a
    spaCy pipeline -- confirm), lemmas found in ``stop_words`` are dropped,
    the rest are joined with single spaces, every character that is neither
    a word character nor whitespace is removed, and the result is
    lower-cased and transliterated with ``unidecode``.
    """
    kept_lemmas = []
    for token in nlp(text):
        lemma = token.lemma_
        if lemma in stop_words:
            continue
        kept_lemmas.append(lemma)

    without_punct = re.sub(r'[^\w\s]', '', ' '.join(kept_lemmas))
    return unidecode.unidecode(without_punct.lower())
def info_articles(article_link):
    """Download an article page and build its article record.

    The page at ``article_link`` is fetched and parsed, and the following
    fields are extracted before being handed to
    ``utilsg4.recovery_article``:

    * title -- text of the ``<title>`` tag, transliterated to ASCII;
    * newspaper -- always ``"Le Monde"``;
    * theme -- link text inside ``<li class="ariane z2">``, else ``'Forum'``;
    * author -- text of ``<span class="auteur">`` (its link if it has one),
      whitespace-normalized, else ``""``;
    * date -- the ``YYYY-MM-DD`` part of the ``<time datetime=...>``
      attribute formatted as ``dd/mm/YYYY``; today's date when the tag, the
      attribute or the date pattern is missing;
    * content -- text of every ``<p>`` found under every ``<div>``.

    :param article_link: URL of the article page.
    :returns: whatever ``utilsg4.recovery_article`` returns.
    """
    # NOTE(review): no timeout is passed, so a stalled server blocks here.
    req = requests.get(article_link)
    soup = BeautifulSoup(req.text, "lxml")

    title = unidecode.unidecode(soup.find('title').string)
    newspaper = "Le Monde"

    # Article theme
    theme_tag = soup.find("li", class_="ariane z2")
    if theme_tag:
        theme = theme_tag.find("a").get_text()
    else:
        theme = 'Forum'

    # Author of the article
    author_tag = soup.find("span", class_="auteur")
    if author_tag:
        author_link = author_tag.find("a")
        if author_link:
            author = author_link.get_text()
        else:
            author = author_tag.get_text()
        author = re.sub(r"\s\s+", " ", author)
        author = re.sub(r"^ ", "", author)
    else:
        author = ""

    # Publication date. Every step may come up empty (no <time> tag, no
    # datetime attribute, no date in it); all of them fall back to today
    # instead of raising before the fallback can be reached.
    time_tag = soup.find("time")
    raw_datetime = time_tag.get("datetime") if time_tag is not None else None
    date_match = None
    if raw_datetime:
        date_match = re.search(r"\d{4}-\d{2}\-\d{2}", raw_datetime)
    if date_match:
        parsed = date.datetime.strptime(date_match.group(0), "%Y-%m-%d")
        date_p = parsed.strftime("%d/%m/%Y")
    else:
        date_p = date.datetime.now().strftime("%d/%m/%Y")

    # Article content.
    # NOTE(review): a <p> nested inside several <div> levels is collected
    # once per enclosing <div>; kept as-is to preserve the existing output.
    content = "".join(
        p.get_text() + " "
        for div in soup.find_all('div')
        for p in div.find_all('p')
    )
    content = unidecode.unidecode(re.sub(r"\s\s+", " ", content))

    new_article = utilsg4.recovery_article(
        title, newspaper, [author], date_p, content, theme)
    return new_article
def preProcess(column):
    """Clean one field value for comparison.

    The value is decoded from UTF-8 when given as bytes, transliterated to
    ASCII with ``unidecode``, has newlines turned into spaces and runs of
    spaces collapsed, is stripped of surrounding whitespace and quote
    characters, and is lower-cased.

    :param column: field value as text or UTF-8 encoded bytes.
    :returns: the cleaned text, or ``None`` when nothing is left.
    """
    import unidecode

    # Text input has no ``decode``; only byte strings need decoding.
    if isinstance(column, bytes):
        column = column.decode("utf8")
    column = unidecode.unidecode(column)
    # Newlines first, so the spaces they become are collapsed as well.
    column = re.sub('\n', ' ', column)
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column or None
def preProcess(column):
    """Normalize a single field value before matching.

    Steps: decode UTF-8 bytes to text (text input is used as given),
    transliterate to ASCII via ``unidecode``, replace newlines with spaces,
    squeeze repeated spaces into one, trim whitespace and surrounding
    single/double quotes, and lower-case the result.

    :param column: the raw value, either text or UTF-8 bytes.
    :returns: the normalized text; ``None`` if the result is empty.
    """
    import unidecode

    if isinstance(column, bytes):
        # Calling ``decode`` unconditionally would fail on text input.
        column = column.decode("utf8")

    cleaned = unidecode.unidecode(column)
    # Replace newlines before squeezing spaces; in the opposite order a
    # newline next to a space would leave a double space behind.
    cleaned = re.sub('\n', ' ', cleaned)
    cleaned = re.sub(' +', ' ', cleaned)
    cleaned = cleaned.strip().strip('"').strip("'").lower().strip()

    if not cleaned:
        return None
    return cleaned
def get_debug_file():
    """Serve a file from the configured output folder by name.

    The name comes from the ``image`` query parameter. It is looked up in
    ``app.config['OUTPUT_FOLDER']`` first as given, then with spaces
    replaced by underscores and transliterated by ``unidecode``. A missing
    parameter aborts with 400, an unknown name with 404. Requests whose
    method is not GET fall through and return ``None``.

    To test:
    curl http://127.0.0.1:5000/debug?image=out-7.jpg --output toto.jpg
    """
    if request.method != 'GET':
        return None

    image_name = request.args.get('image')
    if image_name is None:
        abort(400, description="'image' parameter is mandatory")

    output_folder = app.config['OUTPUT_FOLDER']

    # Exact match on the requested name.
    if image_name in os.listdir(output_folder):
        return send_from_directory(output_folder, image_name)

    # Otherwise try the underscore-separated, ASCII form of the name.
    ascii_name = unidecode.unidecode(image_name.replace(' ', '_'))
    if ascii_name in os.listdir(output_folder):
        return send_from_directory(output_folder, ascii_name)

    abort(404, description="image '%s' not found" % image_name)
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.

    :param column: the text value to clean.
    :returns: the value transliterated to ASCII, with newlines replaced by
        spaces, runs of spaces collapsed, surrounding whitespace and quote
        characters stripped, and lower-cased.
    """
    import unidecode

    column = unidecode.unidecode(column)
    # Newlines must become spaces before runs of spaces are collapsed;
    # otherwise "a \n b" would keep three spaces between the words.
    column = re.sub('\n', ' ', column)
    column = re.sub(' +', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column
def transform_view():
    """Turn an uploaded CSV file into a JSON attachment.

    Reads the ``data_file`` upload, decodes it as UTF-8, takes the first
    line as comma-separated column names, parses the remaining lines
    (transliterated to ASCII) with ``reader``, loads them into a DataFrame,
    passes that to ``transform`` and returns the result as JSON with a
    ``Content-Disposition`` header naming ``odsc_east.json``.

    :returns: the string ``"No file"`` when the upload is empty, otherwise
        the JSON response.
    """
    upload = request.files['data_file']
    if not upload:
        return "No file"

    text = upload.stream.read().decode("utf-8")
    lines = text.replace('\r', '').split('\n')
    cols = lines[0].split(',')

    # presumably ``reader`` is csv.reader -- it yields an empty row for a
    # blank line. Such rows (e.g. from the file's trailing newline) are
    # dropped so they do not turn into all-None rows in the DataFrame.
    df_data = [
        row
        for row in reader(unidecode.unidecode(line) for line in lines[1:])
        if row
    ]
    df = pd.DataFrame(df_data, columns=cols)

    result = transform(df)
    response = jsonify(result)
    response.headers["Content-Disposition"] = "attachment; filename=odsc_east.json"
    return response
def convert_to_eval_rec(r):
    """Flatten a source record into a flat, ASCII-only evaluation record.

    Fields are read from ``r`` itself, from ``r['source']`` and from the
    first entry of ``r['locations']``; missing keys default to ``''``.
    The address (``address1`` + ``address2``) is split into
    ``address_number`` and ``address`` using the module-level ``addr_reg``
    pattern, and the e-mail into ``emailname`` / ``emaildomain`` at ``@``.

    Every string value of the result is transliterated with ``unidecode``.

    :param r: the source record (a dict); a falsy value yields ``{}``.
    :returns: dict of the extracted fields.
    """
    rec = {}
    if r:
        # ``or`` also covers a present-but-empty list, which would
        # otherwise make the ``[0]`` lookup raise IndexError.
        location = (r.get('locations') or [{}])[0]
        source = r.get('source') or {}

        address = location.get('address1', '') + location.get('address2', '')
        digits = addr_reg.match(address)
        if digits:
            rec['address_number'] = digits.groups()[0]
            # Assumes the captured number is the leading part of the address.
            rec['address'] = address[len(rec['address_number']):]
        else:
            rec['address_number'] = ''
            rec['address'] = address

        rec['applicationname'] = source.get('applicationName', '')
        rec['country'] = location.get('countryCode', '')

        email = location.get('email', '').split('@')
        rec['emailname'] = email[0]
        rec['emaildomain'] = email[1] if len(email) >= 2 else ''

        rec['fax'] = location.get('fax', '')
        rec['locality'] = location.get('city', '')
        rec['name'] = r.get('name', '').strip()
        rec['contact_name'] = r.get('contact_name', '').strip()
        rec['original_xoid'] = str(source.get('id', ''))
        rec['postcode'] = location.get('postalCode', '')
        rec['region'] = location.get('state', '')
        rec['status'] = ''
        rec['tel'] = location.get('phone', '')
        rec['website'] = r.get('website', '')
        rec['xoid'] = str(source.get('id', ''))

    # Transliterate value by value. Running unidecode over a json.dumps()
    # string does nothing: non-ASCII characters are already \u-escaped
    # there and come back unchanged from json.loads().
    return {
        key: unidecode.unidecode(value) if isinstance(value, str) else value
        for key, value in rec.items()
    }
except sr.UnknownValueError: print('Google Speech Recognition could not understand audio') # playSound() # arduino.emit("WARNING") except sr.RequestError as e: print( 'Could not request results from Google Speech Recognition service' ) # playSound() # arduino.emit("WARNING") except Exception as e: print(e) # playSound() # arduino.emit("WARNING") removeSpeech(FILE) try: text = unidecode(text) except: text = '' return text if __name__ == '__main__': text = unidecode.unidecode(speechRecognition()) print("text: ", text) print('Do somethings ...')
def remove_acento(self, texto):
    """Return ``texto`` with accents removed, as ASCII text.

    ``unidecode`` is tried first. If it fails for any reason, the text is
    decomposed with Unicode NFKD normalization and every character that
    cannot be encoded as ASCII is dropped.

    :param texto: the text to strip accents from.
    :returns: the ASCII version of ``texto``.
    """
    try:
        return unidecode.unidecode(texto)
    except Exception:
        # Deliberate catch-all fallback, but without swallowing
        # KeyboardInterrupt/SystemExit as a bare ``except`` would.
        # No latin-1 round-trip here: it never changes the text and raises
        # for any character outside latin-1.
        return normalize('NFKD', texto).encode('ASCII', 'ignore').decode()
def remove_special_chars(text):
    """Return ``text`` as transliterated by ``unidecode.unidecode``."""
    transliterated = unidecode.unidecode(text)
    return transliterated