Example #1
    def save(self):
        # Save first so the instance has a primary key for the slug.
        super().save()

        # Derive a human-readable name if none was provided.
        if not self.name:
            try:
                model_name = self.__class__._meta.verbose_name.title()
                self.name = ' '.join([model_name, self.product.category.name,
                                      self.product.name, self.product.code])
            except AttributeError:
                self.name = self.__class__.__name__

        # Build an ASCII slug from the transliterated name plus the pk.
        self.slug = slugify(unidecode.unidecode(self.name) + '-' + str(self.pk))

        # Rename the uploaded file on disk to match the slug.
        if self.file.url != '/media/' + self.file_url:
            path, name, ext = imageFilename(self, self.file.name)
            self.ext = ext.lower()

            filename = f'{path}/{self.slug}.{ext}'
            temp_file = self.file.name
            with open(settings.MEDIA_ROOT + self.file.name, 'rb') as src:
                with open(settings.MEDIA_ROOT + filename, 'wb') as dst:
                    dst.write(src.read())

            self.file.name = filename
            self.file_url = filename

            try:
                os.remove(settings.MEDIA_ROOT + temp_file)
            except OSError:
                pass

        # Save again with the final name, slug and file path.
        super().save()
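For context, a minimal sketch of the module-level scaffolding this method assumes; the field set and the Product relation are inferred from the method and hypothetical, and imageFilename is a project-specific helper that is not shown:

import os

import unidecode
from django.conf import settings
from django.db import models
from django.utils.text import slugify


class OneFile(models.Model):
    # Hypothetical fields inferred from save() above.
    name = models.CharField(max_length=255, blank=True)
    slug = models.SlugField(max_length=255, blank=True)
    ext = models.CharField(max_length=16, blank=True)
    file = models.FileField(upload_to='files/')
    file_url = models.CharField(max_length=255, blank=True)
    product = models.ForeignKey('Product', on_delete=models.CASCADE)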
Example #2
def corpus_from_strings(strings, metadata=None, decode=False,
                        nltk_stop=True, stop_freq=0, add_stop=None,
                        tokenizer=word_tokenize):
    """
    Takes a list of strings and returns a Corpus object whose document
    tokens are the strings.

    :param tokenizer: word tokenization function. Defaults to
        `vsm.extensions.corpusbuilders.util.word_tokenize`.
    :type tokenizer: lambda s -> tokens
    """
    if decode:
        import unidecode
        strings = [unidecode.unidecode(s) for s in strings]

    documents = [tokenizer(s) for s in strings]
    corpus = sum(documents, [])
    indices = np.cumsum([len(d) for d in documents])
    del documents

    # Default labels; `metadata=None` avoids a mutable default argument.
    if not metadata:
        metadata = ['document_{0}'.format(i) for i in range(len(strings))]
    md_type = np.array(metadata).dtype
    dtype = [('idx', int), ('document_label', md_type)]
    context_data = [np.array(list(zip(indices, metadata)), dtype=dtype)]

    c = Corpus(corpus, context_data=context_data, context_types=['document'])
    return apply_stoplist(c, nltk_stop=nltk_stop,
                          freq=stop_freq, add_stop=add_stop)
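A minimal usage sketch, assuming the vsm package (which provides Corpus, apply_stoplist and word_tokenize) plus numpy are importable; the strings and labels are illustrative:

docs = ['Crème brûlée is a dessert.', 'Déjà vu all over again.']
c = corpus_from_strings(docs, metadata=['doc_a', 'doc_b'], decode=True)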
Example #3
def preprocessing_mot(text):
    # `nlp` (a spaCy pipeline) and `stop_words` are module-level globals.
    prepro = nlp(text)
    lem = ' '.join(token.lemma_ for token in prepro
                   if token.lemma_ not in stop_words)
    s = re.sub(r'[^\w\s]', '', lem)  # strip punctuation
    s = s.lower()
    s = unidecode.unidecode(s)       # fold accents to plain ASCII
    return s
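A sketch of the globals preprocessing_mot relies on; the French pipeline name is an assumption:

import re

import spacy
import unidecode
from spacy.lang.fr.stop_words import STOP_WORDS as stop_words

nlp = spacy.load('fr_core_news_sm')  # assumed spaCy French model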
def info_articles(article_link):
    # Fetch and parse the article page.
    req = requests.get(article_link)
    soup = BeautifulSoup(req.text, "lxml")

    title = unidecode.unidecode(soup.find('title').string)

    newspaper = "Le Monde"

    # Article theme, taken from the breadcrumb when present.
    breadcrumb = soup.find("li", class_="ariane z2")
    if breadcrumb:
        theme = breadcrumb.find("a").get_text()
    else:
        theme = 'Forum'

    # Author of the article.
    author_tag = soup.find("span", class_="auteur")
    if author_tag:
        if author_tag.a:
            author = author_tag.find("a").get_text()
        else:
            author = author_tag.get_text()
        author = re.sub(r"\s\s+", " ", author).strip()
    else:
        author = ""

    # Publication date; fall back to today when no date is found.
    # (`date` is assumed to be `import datetime as date`.)
    match = re.search(r"\d{4}-\d{2}-\d{2}", soup.find("time").get("datetime"))
    if match:
        date_p = date.datetime.strptime(match[0], "%Y-%m-%d").strftime("%d/%m/%Y")
    else:
        date_p = date.datetime.now().strftime("%d/%m/%Y")

    # Article content: concatenate every paragraph, normalize whitespace,
    # then transliterate to ASCII.
    content = ""
    for div in soup.find_all('div'):
        for p in div.find_all('p'):
            content += p.get_text() + " "
    content = unidecode.unidecode(re.sub(r"\s\s+", " ", content))

    new_article = utilsg4.recovery_article(title, newspaper, [author], date_p,
                                           content, theme)

    return new_article
def preProcess(column):
    import unidecode
    # Decode only if the value arrived as bytes (the original targeted Python 2).
    if isinstance(column, bytes):
        column = column.decode("utf8")
    column = unidecode.unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    if not column:
        column = None
    return column
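A quick usage sketch of preProcess (the sample value is illustrative):

import re

print(preProcess('"Crème  Brûlée"\n'))  # -> 'creme brulee'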
Example #7
def get_debug_file():
    """
    to test: curl http://127.0.0.1:5000/debug?image=out-7.jpg --output toto.jpg
    """
    if request.method == 'GET':
        image_name = request.args.get('image')

        if image_name is None:
            abort(400, description="'image' parameter is mandatory")

        # Try the name as given, then an ASCII-folded, underscored variant.
        available = os.listdir(app.config['OUTPUT_FOLDER'])
        ascii_name = unidecode.unidecode(image_name.replace(' ', '_'))

        if image_name in available:
            return send_from_directory(app.config['OUTPUT_FOLDER'], image_name)
        elif ascii_name in available:
            return send_from_directory(app.config['OUTPUT_FOLDER'], ascii_name)
        else:
            abort(404, description="image '%s' not found" % image_name)
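The endpoint assumes a configured Flask app; a minimal sketch of the wiring, where the output folder path is an assumption:

import os

import unidecode
from flask import Flask, abort, request, send_from_directory

app = Flask(__name__)
app.config['OUTPUT_FOLDER'] = '/tmp/output'  # assumed output directory

# get_debug_file above would then be registered with:
# @app.route('/debug', methods=['GET'])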
Example #8
def preProcess(column):
    """
    Do a little bit of data cleaning with the help of Unidecode and Regex.
    Things like casing, extra spaces, quotes and new lines can be ignored.
    """
    import unidecode
    # column = column.decode("utf8")  # only needed on Python 2
    column = unidecode.unidecode(column)
    column = re.sub('  +', ' ', column)
    column = re.sub('\n', ' ', column)
    column = column.strip().strip('"').strip("'").lower().strip()
    return column
Example #10
def transform_view():
    file = request.files['data_file']
    if not file:
        return "No file"

    # Read the upload, normalize line endings, and split header from rows.
    file_contents = file.stream.read().decode("utf-8")
    file_contents = file_contents.replace('\r', '').split('\n')
    cols = file_contents[0].split(',')

    # Transliterate each non-empty row to ASCII before parsing it as CSV.
    rows = [unidecode.unidecode(x) for x in file_contents[1:] if x]
    df = pd.DataFrame(list(reader(rows)), columns=cols)

    result = transform(df)
    response = jsonify(result)
    response.headers["Content-Disposition"] = "attachment; filename=odsc_east.json"
    return response
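The view assumes roughly these module-level imports, plus a project-specific transform() that is not shown:

from csv import reader

import pandas as pd
import unidecode
from flask import jsonify, request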
Example #11
def convert_to_eval_rec(r):
    rec = {}
    if r:
        # Guard against both a missing and an empty 'locations' list.
        loc = (r.get('locations') or [{}])[0]

        # Split a leading house number off the address when present.
        address = loc.get('address1', '') + loc.get('address2', '')
        digits = addr_reg.match(address)
        if digits:
            rec['address_number'] = digits.groups()[0]
            rec['address'] = address[len(rec['address_number']):]
        else:
            rec['address_number'] = ''
            rec['address'] = address

        rec['applicationname'] = r.get('source', {}).get('applicationName', '')
        rec['country'] = loc.get('countryCode', '')

        email = loc.get('email', '').split('@')
        rec['emailname'] = email[0]
        rec['emaildomain'] = email[1] if len(email) >= 2 else ''

        rec['fax'] = loc.get('fax', '')
        rec['locality'] = loc.get('city', '')
        rec['name'] = r.get('name', '').strip()
        rec['contact_name'] = r.get('contact_name', '').strip()
        rec['original_xoid'] = str(r.get('source', {}).get('id', ''))
        rec['postcode'] = loc.get('postalCode', '')
        rec['region'] = loc.get('state', '')
        rec['status'] = ''
        rec['tel'] = loc.get('phone', '')
        rec['website'] = r.get('website', '')
        rec['xoid'] = str(r.get('source', {}).get('id', ''))

    # Round-trip through JSON so every nested string is ASCII-folded.
    return json.loads(unidecode.unidecode(json.dumps(rec)))
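A usage sketch with an illustrative record; addr_reg is assumed to be a compiled pattern along these lines:

import json
import re

import unidecode

addr_reg = re.compile(r'^(\d+)')  # assumed: captures a leading house number

record = {
    'name': 'Café Müller ',
    'source': {'id': 42, 'applicationName': 'demo'},
    'locations': [{'address1': '12 Rue de la Paix', 'address2': '',
                   'city': 'Paris', 'countryCode': 'FR',
                   'email': 'info@example.com', 'phone': '', 'fax': '',
                   'postalCode': '75002', 'state': ''}],
}
print(convert_to_eval_rec(record)['name'])  # -> 'Cafe Muller'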
Example #12
File: STT.py Project: fixcer/IT5006
        except sr.UnknownValueError:
            print('Google Speech Recognition could not understand audio')
            # playSound()
            # arduino.emit("WARNING")
        except sr.RequestError as e:
            print(
                'Could not request results from Google Speech Recognition '
                'service; {0}'.format(e)
            )
            # playSound()
            # arduino.emit("WARNING")
        except Exception as e:
            print(e)
            # playSound()
            # arduino.emit("WARNING")

        removeSpeech(FILE)

    # Transliterate the recognized text to ASCII (assumes `import unidecode`).
    try:
        text = unidecode.unidecode(text)
    except Exception:
        text = ''

    return text


if __name__ == '__main__':
    text = unidecode.unidecode(speechRecognition())
    print("text: ", text)
    print('Do something ...')
Example #13
	def remove_acento(self, texto):
		"""Strip accents, falling back to NFKD normalization."""
		try:
			return unidecode.unidecode(texto)
		except Exception:
			# Fallback: validate as latin-1, then drop the combining marks.
			return normalize('NFKD', texto.encode('iso-8859-1').decode('iso-8859-1')).encode('ASCII', 'ignore').decode()
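The fallback assumes these module-level imports:

import unidecode
from unicodedata import normalize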
Example #14
def remove_special_chars(text):
    # Transliterates any Unicode text to its closest ASCII representation.
    return unidecode.unidecode(text)
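A quick check of the behavior:

import unidecode

print(remove_special_chars('São Paulo – Ævar'))  # -> 'Sao Paulo - AEvar'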