def remove_punctuations_and_expressions(text_non_char):
    normalize_punctuations = ' '.join([
        word.lstrip(punctuation.replace(".", "")).rstrip(punctuation).strip()
        for word in text_non_char.split() if word not in punctuation
    ])
    normalize_punctuations = ' '.join([
        re.sub("[{" + punctuation.replace("-", "").replace(".", "") + "}]", " ", word)
        for word in normalize_punctuations.split()
    ])
    return normalize_punctuations
def load_dataset(path_xml):
    """Loads dataset into memory from xml file"""
    def get_label(c, length):
        s = ""
        for _ in range(length):
            s += c
        return s

    DOMTree = xml.dom.minidom.parse(path_xml)
    collection = DOMTree.documentElement
    sents = collection.getElementsByTagName("sentence")
    LABEL = ['B', 'I']
    dataset = []
    for sent in sents:
        text = sent.getElementsByTagName('text')[0].childNodes[0].data  # type: str
        text = text.lower()
        tags_text = text
        aspects = sent.getElementsByTagName('aspectTerm')
        for aspect in aspects:
            term = aspect.getAttribute('term')  # type: str
            start = int(aspect.getAttribute('from'))
            end = int(aspect.getAttribute('to'))
            tokens = term.split()
            ttags = [
                get_label('B', len(tokens[i])) if i == 0
                else get_label('I', len(tokens[i]))
                for i in range(len(tokens))
            ]
            tags_str = " ".join(ttags)
            tags_text = tags_text[:start] + tags_str + tags_text[end:]
        tags_text = "".join(c for c in tags_text
                            if c not in punctuation.replace('\'', ''))
        tags = tags_text.split()
        tags = [tag[0] if tag[0] in LABEL else 'O' for tag in tags]
        text = "".join(c for c in text
                       if c not in punctuation.replace('\'', ''))
        words = text.split()
        assert len(tags) == len(words)
        dataset.append((words, tags))
    return dataset
def slugify(string):
    '''
    Slugify a string. For example "Hello world!" becomes "hello-world"

    - Punctuation is removed.
    - White spaces are replaced by a hyphen.
    - Letters are lowercased.

    Args:
        string (str): The string to slugify.

    Returns:
        str: The slugified string.
    '''
    # Lowercase
    string = string.lower()
    # Convert to ASCII characters
    string = unidecode(string)
    # Remove punctuation except for hyphens
    puncs = punctuation.replace('-', '')
    for punc in puncs:
        string = string.replace(punc, '')
    # Replace spaces with a single hyphen
    string = re.sub(r'\s+', '-', string)
    # Replace multiple hyphens with a single hyphen
    string = re.sub(r'\-+', '-', string)
    # Remove trailing hyphens
    string = string.strip('-')
    return string
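# A minimal usage sketch for slugify() above (hedged: assumes `re`, `unidecode`
# and `punctuation` are imported exactly as the snippet expects):
#
#     slugify("Hello world!")      # -> "hello-world"
#     slugify("Crème   brûlée!!")  # -> "creme-brulee"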
def myAtoi(self, str):
    """
    :type str: str
    :rtype: int
    """
    if not str:
        return None
    start = 0
    for i, char in enumerate(str):
        if char != ' ':
            start = i
            break
    if start == len(str) - 1:
        return None
    else:
        str = str[start:]
    # punctuation minus the '+' and '-' sign characters
    not_valid = punctuation.replace('+', '-').replace('-', '')
    if str[0] in not_valid:
        return None
    for i, char in enumerate(str):
        # The original snippet is truncated here; assumed completion:
        # stop at the first character that is neither a digit nor a leading sign.
        if not (char.isdigit() or (i == 0 and char in '+-')):
            str = str[:i]
            break
    try:
        return int(str)
    except ValueError:
        return None
def get_ipa(word):
    transcription = search_ipa(word)
    # If not found, remove punctuation (except hyphens) and try again.
    if (not transcription) and (any(char in word for char in punctuation)):
        no_punc = re.sub("[%s]" % punctuation.replace("-", ""), "", word)
        transcription = search_ipa(no_punc)
        # If not found, collapse a hyphenated word into one and try again
        if (not transcription) and ("-" in no_punc):
            no_punc_one_word = no_punc.replace("-", "")
            transcription = search_ipa(no_punc_one_word)
            # If not found, try it with a space instead of hyphen.
            if not transcription:
                no_punc_with_space = no_punc.replace("-", " ")
                transcription = search_ipa(no_punc_with_space)
            # If not found, split where the hyphens are and look up the words separately.
            if not transcription:
                word_split = no_punc.split("-")
                transcription = []
                for item in word_split:
                    ipa = search_ipa(item)
                    # Add each part to the transcription. First chop each off at the comma, if any.
                    if ipa[0].find(",") != -1:
                        transcription.append(ipa[0][0:ipa[0].index(",")] + " /")
                    else:
                        transcription.append(ipa[0])
                transcription = ["".join(transcription)]
    return transcription
def Categories(category=None):
    categories = User.getCategories()
    if category is None:
        # print(categories)
        for cat in categories:
            image = cat.get('image')
            cat.update({'image': compressImage(image)})
        print(categories[0].get('category'))
        return render_template('category.html', categories=categories, enumerate=enumerate)
    if any(char in punctuation.replace('-', '') for char in category):
        flash('Invalid Category!')
        print(category)
        return redirect('/categories', 302)
    cats = User.getCategories(category)
    if cats:
        imgs = [i for i in cats]
    else:
        flash('Category not found!')
        return redirect('/')
    if imgs:
        return render_template('imgByCat.html', category=category)
    else:
        flash('Category not Found!')
        return redirect('/categories', 302)
def row_clean(row):
    # Remove HTML special entities (e.g. &amp;)
    row_no_special_entities = re.sub(r'\&\w*;', '', row)
    # Remove tickers (clickable stock market symbols that work like hashtags
    # and start with dollar signs instead). $ needs to be escaped because it
    # means something in regex; \w means alphanumeric char or underscore.
    row_no_tickers = re.sub(r'\$\w*', '', row_no_special_entities)
    # Remove hyperlinks
    row_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', row_no_tickers)
    # Remove hashtags
    row_no_hashtags = re.sub(r'#\w*', '', row_no_hyperlinks)
    # Remove punctuation and split 's, 't, 've with a space for the filter
    row_no_punctuation = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', row_no_hashtags)
    # Remove words with 2 or fewer letters (also takes care of RT); \b is a word boundary
    row_no_small_words = re.sub(r'\b\w{1,2}\b', '', row_no_punctuation)
    # Remove whitespace (including new line characters)
    row_no_whitespace = re.sub(r'\s\s+', ' ', row_no_small_words)
    row_no_whitespace = row_no_whitespace.lstrip(' ')  # Remove single space left on the left
    # Remove •
    row_no_ball = re.sub(r'•', ' ', row_no_whitespace)
    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode.
    # Apart from emojis (plane 1), this also removes historic scripts and
    # mathematical alphanumerics (also plane 1), ideographs (plane 2) and more.
    row_no_emojis = ''.join(c for c in row_no_ball if c <= '\uFFFF')
    # Tokenize: reduce length and remove handles
    # reduce_len changes, for example, waaaaaayyyy to waaayyy.
    tknzr = TweetTokenizer(preserve_case=True, reduce_len=True, strip_handles=True)
    tw_list = tknzr.tokenize(row_no_emojis)
    # Remove stopwords
    list_no_stopwords = [i for i in tw_list if i not in cache_english_stopwords]
    # Final filtered row; ''.join() would join without spaces between words.
    row_filtered = ' '.join(list_no_stopwords)
    return row_filtered
def processTweet(tweet):
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # Remove @username mentions
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https?:\/\/.*\/\w*', '', tweet)
    # Remove hashtags
    tweet = re.sub(r'#\w*', '', tweet)
    # Remove punctuation and split 's, 't, 've with a space for the filter
    tweet = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove space-separated Arabic letters
    tweet = re.sub(r'أ ب ت ث ج ح خ د ذ ر ز س ش ص ض ط ظ ع غ ف ق ك ل م ن هـ و ي', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    return tweet
def convert_to_file_name(self, text):
    sentence = str(text).replace(" ", "_")
    my_punctuation = punctuation.replace("_", "")
    sentence = (sentence.translate(str.maketrans("", "", my_punctuation))).lower()
    file_name = sentence + self.sound_file_extension
    return file_name
def clean_lyrics(lyrics, song_per_line):
    # punctuation string: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    punctuation_regex = re.compile('[{}]'.format(punctuation.replace("'", "")))
    lyrics = punctuation_regex.sub(' ', lyrics)
    lyrics = re.sub(r"''+", " ", lyrics)
    lyrics = re.sub(r"'", "", lyrics)
    lyrics = re.sub(r"\\uFFFD", '_', lyrics)  # replace unicode unknown characters with underscores
    lyrics = re.sub('\x00', "", lyrics)  # remove NUL bytes
    lyrics = lyrics.lower()
    lyrics = re.sub(r"\\r", ' ', lyrics)  # remove line ending symbols
    lyrics = re.sub(r"\\n", ' ', lyrics)  # remove line ending symbols
    lyrics = re.sub(r"[^a-z]_+[^a-z]", ' ', lyrics)  # remove all underscores not preceded by a letter
    lyrics = re.sub(r"\\", ' ', lyrics)  # remove backslashes
    lyrics = re.sub(' +', ' ', lyrics)  # remove surplus spaces
    lyrics = replace_numbers(lyrics)
    lyrics = re.sub('\r', '\n', lyrics)  # to be on the safe side of things
    lyrics = re.sub('\n +', '\n', lyrics)
    lyrics = re.sub("\n\n+", '\n', lyrics)  # remove empty lines
    if song_per_line:
        # lyrics = re.sub('\n', ' ', lyrics)  # remove all breaks between lines within one song
        lyrics = re.sub('\n', '.\n', lyrics)  # replace all breaks with breaks followed by dots
        lyrics = re.sub(' +', ' ', lyrics)  # remove surplus spaces
        lyrics = re.sub(' \.', '.', lyrics)  # remove surplus spaces
    return lyrics.strip()  # remove spare start/end spaces
def __init__(self):
    self.stopwords = list(
        map(lambda x: x.strip(),
            open("./data/ro_stopwords.txt", "r", encoding="utf-8").readlines()))
    self.punctuation = punctuation.replace("$", "") + "0123456789"
def add_sample(self, sample):
    if not isinstance(sample, str):
        raise TypeError
    # Calling add_sample should replace the existing sample.
    # To avoid appending new values onto existing lists:
    self.sample = sample
    self.misspelled_words = []
    self.tokenized_sample = []
    self.tagged_sample = {}
    sample = sample.replace('\n', " ")
    sample = sample.rstrip(" ")
    for char in punctuation.replace("'", ""):
        sample = sample.replace(char, "")
    tokens = word_tokenize(sample)
    for word in tokens:
        if word.lower() in words.words():
            self.tokenized_sample.append(word)
        elif word.capitalize() in names.words():
            continue
        elif "'" in word:
            self.tokenized_sample.append(word)
        elif LEMMATIZER.lemmatize(word.lower()) not in words.words():
            if STEMMER.stem(word.lower()) not in words.words():
                self.misspelled_words.append(word)
            else:
                self.tokenized_sample.append(word)
    self.tagged_sample = pos_tag(tokens)
def render(self, context):
    from string import punctuation
    from django.template.defaultfilters import dictsortreversed
    try:
        removelist = ["a", "an", "as", "at", "but", "by", "for", "from",
                      "is", "in", "into", "of", "off", "on", "onto", "per",
                      "since", "than", "the", "this", "that", "to", "up",
                      "via", "with", "and", "it", "be", "was", "i", "you",
                      "me", "my", "is", "so", "some", "it's", "its", "are",
                      "if", "some", "there", "what", "just", ""]
        cat_statuses = ""
        for status in Status.objects.all():
            cat_statuses += status.body + " "
        wordlist = cat_statuses.split()
        punctuation = punctuation.replace('@', '')
        wordlist = [word.strip(punctuation).lower() for word in wordlist]
        wordfreq = [wordlist.count(p) for p in wordlist]
        dictionary = dict(zip(wordlist, wordfreq))
        word_dict_list = []
        for key in dictionary:
            if key not in removelist and not key.startswith('@'):
                word_dict_list.append({'name': key, 'count': dictionary[key]})
        context[self.varname] = dictsortreversed(word_dict_list, 'count')[:int(self.num)]
    except:
        pass
    return ''
def cleanTextFiles(data_folder, text_files=[], stop_words={}):
    dictionary_list = []
    # loop through each text file, read and clean
    for i in range(0, len(text_files)):
        file_path = os.path.join(data_folder, text_files[i])
        file_open = open(file_path, encoding="utf-8-sig").read()
        # remove punctuation (leave apostrophes), convert to lowercase and split
        my_punctuation = punctuation.replace("'", "")
        text = file_open.translate(str.maketrans("", "", my_punctuation))
        file_strings = text.lower().split()
        # clean file_strings from stop words
        cleaner = TextCleaner()
        updated_file_strings = cleaner.compareRemove(stop_words, file_strings)
        # lemmatize updated_file_strings
        lemmtzr = nltk.stem.wordnet.WordNetLemmatizer()
        lemmas = [lemmtzr.lemmatize(token) for token in updated_file_strings]
        # produce a dictionary with word occurrence counts
        counter = TextCounter()
        string_dictionary = counter.countElements(lemmas)
        dictionary_list.append(string_dictionary)
    return dictionary_list
def processTweet(tweet):
    from string import punctuation
    # Remove HTML special entities (e.g. &amp;)
    tweet = re.sub(r'\&\w*;', '', tweet)
    # Remove @username mentions
    tweet = re.sub(r'@[^\s]+', '', tweet)
    # Remove numbers
    tweet = re.sub(r'\d+', ' ', tweet)
    # Split camelCase runs into separate words
    tweet = re.sub(r'([a-z])([A-Z])', '\\1 \\2', tweet)
    # Remove tickers
    tweet = re.sub(r'\$\w*', '', tweet)
    # To lowercase
    tweet = tweet.lower()
    # Remove hyperlinks
    tweet = re.sub(r'https:\/\/t.co\/.{9}', '', tweet)
    # Remove hashtag symbols
    tweet = re.sub(r'#', ' ', tweet)
    # Remove punctuation and split 's, 't, 've with a space for the filter
    tweet = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', tweet)
    tweet = re.sub(r'[^\w\s]', ' ', tweet)
    # Remove words with 2 or fewer letters
    tweet = re.sub(r'\b\w{1,2}\b', '', tweet)
    # Remove whitespace (including new line characters)
    tweet = re.sub(r'\s\s+', ' ', tweet)
    # Remove single space remaining at the front of the tweet.
    tweet = tweet.lstrip(' ')
    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode
    tweet = ''.join(c for c in tweet if c <= '\uFFFF')
    return tweet
def config_entries(dir=os.getcwd()):
    """
    retrieve information from config file

    if there's no config.ini in the provided directory: copy config.template.ini
    generate a secret key if none is found in config.ini
    """
    config = configparser.ConfigParser()
    if 'config.ini' not in os.listdir(dir):
        # copy template file if config.ini doesn't exist
        print('creating config file')
        copyfile(dir + '/config.template.ini', dir + '/config.ini')
    config.read(dir + '/config.ini')
    host = config['server']['host']
    port = config['server']['port']
    try:
        # raise an exception if secret-key is not found or empty
        s_key = config['server']['secret-key']
        if len(s_key) == 0:
            raise Exception('invalid secret key')
    except:
        print('generating secret key')
        # generate secret key (16 characters)
        chars = ascii_letters + digits + punctuation.replace('%', '')
        s_key = ''.join(choices(chars, k=16))
        # write secret key to config file
        config['server']['secret-key'] = s_key
        with open('config.ini', 'w') as configfile:
            config.write(configfile)
    return {'secret-key': s_key, 'host': host, 'port': port}
def __init__(self, index: 'Index', or_operator: str = 'OR',
             and_operator: str = 'AND', difference_operator: str = 'NOT',
             punctuation: str = None):
    '''
    Parameters:
    ------------
    index: an Index whose docmap is a dictionary where keys correspond to
        query tokens, and values are sets containing CordDoc objects
    '''
    self.index = index
    self.token_to_set = self.index.docmap
    self.or_operator = or_operator
    self.and_operator = and_operator
    self.difference_operator = difference_operator
    self.operators = {
        self.or_operator,
        self.and_operator,
        self.difference_operator
    }
    self.punctuation = punctuation
    if self.punctuation is None:
        self.punctuation = PUNCTUATION.replace('(', '').replace(')', '')
def from_dict(self, d):
    """
    Create a Pipeline from a dictionary. The change is made in place.

    :argument: python dictionary
    :return: None
    """
    if 'uid' in d:
        if d['uid']:
            self._uid = d['uid']

    if 'name' in d:
        if d['name']:
            invalid_symbols = punctuation.replace('.', '')
            if not isinstance(d['name'], str):
                raise ree.TypeError(expected_type=str,
                                    actual_type=type(d['name']))
            if any(symbol in d['name'] for symbol in invalid_symbols):
                raise ree.ValueError(obj=self._uid,
                                     attribute='name',
                                     actual_value=d['name'],
                                     expected_value="Valid object names can "
                                     "contain letters, numbers and '.'. Any "
                                     "other character is not allowed")
            self._name = d['name']

    if 'state' in d:
        if isinstance(d['state'], str):
            if d['state'] in list(states._pipeline_state_values.keys()):
                self._state = d['state']
            else:
                raise ree.ValueError(obj=self._uid,
                                     attribute='state',
                                     expected_value=list(states._pipeline_state_values.keys()),
                                     actual_value=d['state'])
        else:
            raise ree.TypeError(entity='state', expected_type=str,
                                actual_type=type(d['state']))
    else:
        self._state = states.INITIAL

    if 'state_history' in d:
        if isinstance(d['state_history'], list):
            self._state_history = d['state_history']
        else:
            raise ree.TypeError(entity='state_history', expected_type=list,
                                actual_type=type(d['state_history']))

    if 'completed' in d:
        if isinstance(d['completed'], bool):
            if d['completed']:
                self._completed_flag.set()
        else:
            raise ree.TypeError(entity='completed', expected_type=bool,
                                actual_type=type(d['completed']))
def repl(x):
    # punctuation without '-', '_', '.' and '@'
    to_join = punctuation.replace('-', '').replace('_', '').replace(
        '.', '').replace('@', '').split()
    for p in " ".join(to_join):
        x = x.replace(p, ' ')
    return x.strip()
def count_words(sentence):
    # need to remove ' from the set, as it is the one char we specifically
    # want to keep inside a word
    for char in punctuation.replace("'", ""):
        sentence = sentence.lower().replace(char, " ")
    split_words = [item.strip("'") for item in sentence.split()]
    return {item: split_words.count(item) for item in split_words}
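# Illustrative call for count_words() above, assuming `punctuation` comes from
# the string module; the expected mapping is worked out by hand:
#
#     count_words("Joe can't tell between 'large' and large.")
#     # -> {'joe': 1, "can't": 1, 'tell': 1, 'between': 1, 'large': 2, 'and': 1}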
def __init__(self):
    # List of symbols we don't care about
    self.SYMBOLS = " ".join(punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]
    # tools for splitting text
    # TODO(krzum) add support for this in py3
    self.punctuation = punctuation.replace("'", '').replace('"', '')
    self.trans = maketrans(punctuation, ' ' * len(punctuation))
def tokenize(doc, keep_internal_punct=False):
    """
    Tokenize a string.
    The string should be converted to lowercase.
    If keep_internal_punct is False, then return only the alphanumerics
    (letters, numbers and underscore).
    If keep_internal_punct is True, then also retain punctuation that is
    inside of a word. E.g., in the example below, the token "isn't" is
    maintained when keep_internal_punct=True; otherwise, it is split into
    "isn" and "t" tokens.

    Params:
      doc....a string.
      keep_internal_punct...see above
    Returns:
      a numpy array containing the resulting tokens.

    >>> tokenize(" Hi there! Isn't this fun?", keep_internal_punct=False)
    array(['hi', 'there', 'isn', 't', 'this', 'fun'], dtype='<U5')
    >>> tokenize("Hi there! Isn't this fun? ", keep_internal_punct=True)
    array(['hi', 'there', "isn't", 'this', 'fun'], dtype='<U5')
    """
    doc = doc.lower()
    replaceunderscore = punctuation.replace("_", "")
    if keep_internal_punct:
        token = ' '.join(
            filter(None, (divide.strip(replaceunderscore) for divide in doc.split())))
        token = re.sub(r'\s+', " ", token).split()
        return np.array(token, dtype="unicode")
    else:
        token = re.sub(r"[^\w]", " ", doc).split()
        return np.array(token, dtype="unicode")
def text_reg(word):
    remove = punctuation.replace('-', '').replace('<', '').replace('>', '')
    word = word.lower().replace("'s", "").replace("n't", "").replace('/n', '').replace('/v', '')
    word = word.translate(str.maketrans('', '', remove))
    if word in ('don', 'hasn', 'hadn', 'shouldn', 'couldn', 'wouldn', 'shan',
                'weren', 'didn', 'haven', 'isn', 'needn', 'aren', 'mustn',
                'doesn', 'mightn', 'wasn', 'ain'):
        word = word.rstrip('n')
    return word
def text_reg(word):
    remove = punctuation.replace('-', '').replace('<', '').replace('>', '')
    # remove = punctuation.replace('<', '').replace('>', '')
    word = word.lower().replace("'s", "").replace("n't", "").replace('/n', '').replace('/v', '')
    word = word.translate(str.maketrans('', '', remove))
    word = word.replace('-', ' ')
    word = word.replace('—', ' ')
    return word
def tokenise(review):
    # punctuation string: '!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    punctuation_regex = re.compile('[{}]'.format(punctuation.replace("'", "")))
    review = review.strip()
    review = punctuation_regex.sub(' ', review)
    review = re.sub(r"'+", "", review)
    tokens = review.lower().split()
    tokens = ['6' * len(token) if token.isdigit() else token
              for token in tokens if token not in stopwords]
    return tokens
def valid_selection(start, end):
    from string import punctuation, whitespace
    punctuation = punctuation.replace("_", "")
    valid_characters = punctuation + whitespace
    if not (end.get_char() in valid_characters):
        return False
    if start.starts_line():
        return True
    start.backward_char()
    if not (start.get_char() in valid_characters):
        return False
    return True
def strong_password(size=10):
    nbig = size / 5
    nsmall = size / 3
    v = ([choice(ascii_uppercase) for x in xrange(nbig)]
         + [choice(ascii_lowercase) for x in xrange(nsmall)]
         + [choice(digits) for x in xrange(size / 2)]
         + [choice(punctuation.replace('|', '-')) for x in xrange(size / 5)])
    shuffle(v)
    return v[:size]
def __init__(self):
    self.stop_words = stopwords.words('english')
    self.stop_words.extend([
        "rt", "n't", "'re", "gon", "na", "covid", "coronavirus", "covid-19"
    ])
    self.punctuation_to_remove = punctuation.replace('#', '').replace(
        '@', '').replace('%', '').replace('$', '')
    self.symbols = "<>:\"/\\|!?*~.'`-_()^,+=;"
    self.token_stemmer = stemmer.Stemmer()
def snake_case(string):
    'Convert a string to snake_case.'
    string = re.sub('(.)([A-Z][a-z])', r'\1_\2', string)  # *Aa -> *_Aa
    string = re.sub('(.)([0-9]+)', r'\1_\2', string)      # *00 -> *_00
    string = string.replace('-', '_')                     # A-B -> A_B
    string = ''.join(c for c in string if c not in punctuation.replace('_', ''))
    string = string.replace(' ', '_').strip('_').lower()  # _A B -> a_b
    string = re.sub('_+', '_', string)                    # a__b -> a_b
    return string
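# Rough usage sketch for snake_case() above (assumes `re` and `punctuation`
# are in scope); outputs traced by hand, so treat them as illustrative:
#
#     snake_case("getHTTPResponse2Code")  # -> "gethttp_response_2_code"
#     snake_case("Already snake-cased!")  # -> "already_snake_cased"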
def build_vocab(frame):
    p = punct.replace('.', '')
    d = dict.fromkeys(p, ' ')
    tr = str.maketrans(d)
    text = frame.body.explode()
    text = str_app(text, 'translate', tr)
    text = str_app(text, 'replace', r'\s+', ' ', regex=True)
    text = str_app(text, 'lower')
    text = str_app(text, 'split').explode()
    return text
def tweet_clean(tweet):
    # print('Original tweet:', tweet, '\n')
    # Remove HTML special entities (e.g. &amp;) and stray @-entities
    tweet_no_special_entities = re.sub(r'\&\w*;', '', tweet)
    tweet_no_special_entities = re.sub(r'\@\w*;', '', tweet_no_special_entities)
    # print('No special entities:', tweet_no_special_entities, '\n')
    # Remove tickers
    tweet_no_tickers = re.sub(r'\$\w*', '', tweet_no_special_entities)
    # print('No tickers:', tweet_no_tickers, '\n')
    # Remove hyperlinks
    tweet_no_hyperlinks = re.sub(r'https?:\/\/.*\/\w*', '', tweet_no_tickers)
    # print('No hyperlinks:', tweet_no_hyperlinks, '\n')
    # Remove hashtags
    tweet_no_hashtags = re.sub(r'#\w*', '', tweet_no_hyperlinks)
    # print('No hashtags:', tweet_no_hashtags, '\n')
    # Remove punctuation and split 's, 't, 've with a space for the filter
    tweet_no_punctuation = re.sub(r'[' + punctuation.replace('@', '') + ']+', ' ', tweet_no_hashtags)
    # print('No punctuation:', tweet_no_punctuation, '\n')
    # Remove leftover https/http fragments (chained so the first substitution is kept)
    tweet_no_https = re.sub(r'https', '', tweet_no_punctuation)
    tweet_no_https = re.sub(r'http', '', tweet_no_https)
    # print('No https:', tweet_no_https, '\n')
    # Remove words with 2 or fewer letters
    tweet_no_small_words = re.sub(r'\b\w{1,2}\b', '', tweet_no_https)
    # print('No small words:', tweet_no_small_words, '\n')
    # Remove whitespace (including new line characters)
    tweet_no_whitespace = re.sub(r'\s\s+', ' ', tweet_no_small_words)
    tweet_no_whitespace = tweet_no_whitespace.lstrip(' ')  # Remove single space remaining at the front
    # print('No whitespace:', tweet_no_whitespace, '\n')
    # Remove characters beyond the Basic Multilingual Plane (BMP) of Unicode.
    # Apart from emojis (plane 1), this also removes historic scripts and
    # mathematical alphanumerics (also plane 1), ideographs (plane 2) and more.
    # tweet_no_emojis = ''.join(c for c in tweet_no_whitespace if c <= '\uFFFF')
    # print('No emojis:', tweet_no_whitespace, '\n')
    # Tokenize: change to lowercase, reduce length and remove handles
    # reduce_len changes, for example, waaaaaayyyy to waaayyy.
    tknzr = TweetTokenizer(preserve_case=False, reduce_len=True, strip_handles=True)
    tw_list = tknzr.tokenize(tweet_no_whitespace)
    # print('Tweet tokenize:', tw_list, '\n')
    # Remove stopwords
    list_no_stopwords = [i for i in tw_list if i not in english_stopwords]
    # print('No stop words:', list_no_stopwords, '\n')
    # Final filtered tweet
    tweet_filtered = ' '.join(list_no_stopwords)
    tweet_filtered = tweet_filtered.replace(')', '')
    tweet_filtered = tweet_filtered.encode('ascii', 'ignore')
    # print 'Final tweet: ' + tweet_filtered
    return tweet_filtered
def __init__(self):
    # prep chars to remove, keeping single quote, comma, hyphen and period
    self.charsToRemove = punctuation.replace("'", "").replace(",", "").replace(
        "-", "").replace(".", "")
    # and add some other chars to remove
    self.charsToRemove += "®“”"
    self.charsToRemoveMap = np.full((65536), False)
    for i in range(len(self.charsToRemove)):
        c = self.charsToRemove[i]
        self.charsToRemoveMap[ord(c)] = True
def remove_punctuations(content):
    content = ''.join(characters for characters in content
                      if characters not in '!}{][)(\><=#"$%&,/*`\'')
    punc = punctuation.replace('-', "")
    content = ' '.join(
        token.strip(punc) for token in content.split() if token.strip(punc))
    content = ' '.join(
        token.replace("'", "") for token in content.split() if token.replace("'", ""))
    content = ' '.join(content.split())
    return content
def __init__(self, coordinates):
    self.coordinates = coordinates
    self.map = []
    self.largest_x = 0
    self.largest_y = 0
    self.chars = ascii_lowercase + punctuation.replace('.', '').replace(',', '')
    self.char_dict = {}
    self.distance_tracker = {}
    self.before_after_dict = {}
    self.nearest_char_per_coordinate = {}
def _format_name(self, name):
    punc = punctuation.replace('_', '')
    name = str(name).translate(None, punc)
    name = name.replace(' ', '_')
    if name[0].isdigit():
        name = '_' + name
    if len(name) < 30:
        return name
    self.num_lg_names += 1
    sas_name = name[:20]
    sas_name += '_lg_{0}'.format(self.num_lg_names)
    return sas_name
def __init__(self, string_in, filter_plus=None):
    from string import punctuation
    for item in list('._-@'):
        punctuation = punctuation.replace(item, '')
    for item in punctuation:
        string_in = string_in.replace(item, ' ')
    string_in = list(set(string_in.strip().split()))
    for items in string_in:
        for l in list('._-@'):
            items = items.strip(l)
        if self.isvalid(items, filter_plus):
            self.append(items)
def _format_name(self, name):
    punc = punctuation.replace('_', '')
    name = str(name).translate(None, punc)
    name = name.replace(' ', '_')
    words = name.split('_')
    for i, w in enumerate(words):
        if i == 0:
            name = w.lower()
            continue
        name += w.capitalize()
    if name[0].isdigit():
        name = '_' + name
    return name
def extract_words(input_string):
    """
    Returns a list of lowercase words in a string.
    Punctuation and digits are separated out into their own words.
    """
    for c in punctuation.replace('@', "") + digits:
        input_string = input_string.replace(c, "")
    print input_string
    splitted_string = input_string.lower().split()
    return [x for x in splitted_string
            if not (x.startswith("http") or x.startswith("@"))]
def parse_hook_from_text(text):
    hook = text.strip().replace(" ", "_").lower()
    hook = hook.replace("/:", ":").replace("/", ":")
    hook = hook.replace(":", "_")
    for token in punctuation.replace("_", ""):
        hook = hook.replace(token, "")
    if hook.startswith("_"):
        hook = hook[1:]
    if hook.endswith("_"):
        hook = hook[:-1]
    return hook
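# Hand-traced examples for parse_hook_from_text() above (assumes `punctuation`
# is string.punctuation); the inputs are made up for illustration:
#
#     parse_hook_from_text("push/event")      # -> "push_event"
#     parse_hook_from_text("  New Release ")  # -> "new_release"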
def admin_keywords_submit(request):
    """
    Adds any new given keywords from the custom keywords field in the
    admin, and returns their IDs for use when saving a model with a
    keywords field.
    """
    keyword_ids, titles = [], []
    remove = punctuation.replace("-", "")  # Strip punctuation, allow dashes.
    for title in request.POST.get("text_keywords", "").split(","):
        title = "".join([c for c in title if c not in remove]).strip()
        if title:
            kw, created = Keyword.objects.get_or_create_iexact(title=title)
            keyword_id = str(kw.id)
            if keyword_id not in keyword_ids:
                keyword_ids.append(keyword_id)
                titles.append(title)
    return HttpResponse("%s|%s" % (",".join(keyword_ids), ", ".join(titles)))
def convert_lat_lon_strings(string):
    # cleans a latitude or longitude text string and converts it into decimal degrees
    from string import punctuation
    for symbol in punctuation.replace('-', '').replace('.', ''):
        string = string.replace(symbol, ' ')  # replace punctuation (other than - and .) with space
    coord_list = string.split()
    if coord_list[-1] == 'N' or coord_list[-1] == 'S' or coord_list[-1] == 'E' or coord_list[-1] == 'W':
        if coord_list[-1] == "S" or coord_list[-1] == "W":
            # if the coordinate is in the southern or western hemisphere, the lat/lon is negative
            if coord_list[0].find('-') == -1:
                coord_list[0] = '-' + coord_list[0]  # change the hemisphere indicator to +/-
        coord_list.pop()  # remove the hemisphere indicator
    coord_list = [float(coord) for coord in coord_list]  # convert each element to float
    coordinate = convert_DMS_to_degrees(coord_list)
    if abs(coordinate) > 180:
        return 0
    return coordinate
def render(self, context):
    from string import punctuation
    from django.template.defaultfilters import dictsortreversed
    try:
        cat_statuses = ""
        for status in Status.objects.all():
            cat_statuses += status.body + " "
        wordlist = cat_statuses.split()
        punctuation = punctuation.replace('@', '')
        wordlist = [word.strip(punctuation).lower().replace("'s", '')
                    for word in wordlist]
        wordfreq = [wordlist.count(p) for p in wordlist]
        dictionary = dict(zip(wordlist, wordfreq))
        word_dict_list = []
        for key in dictionary:
            if key.startswith('@'):
                word_dict_list.append({'name': key[1:], 'count': dictionary[key]})
        context[self.varname] = dictsortreversed(word_dict_list, 'count')[:int(self.num)]
    except:
        pass
    return ''
def convert_lat_lon_strings(string):
    # cleans a latitude or longitude text string into decimal degrees
    from string import punctuation
    for symbol in punctuation.replace('-', '').replace('.', ''):
        string = string.replace(symbol, " ")  # replace punctuation (other than - and .) with space
    coord_list = string.split()
    if coord_list[-1] == 'N' or coord_list[-1] == 'S' or coord_list[-1] == 'E' or coord_list[-1] == 'W':
        if coord_list[-1] == "S" or coord_list[-1] == "W":
            # if the coordinate is in the southern or western hemisphere, the lat/lon is negative
            if coord_list[0].find('-') == -1:
                coord_list[0] = '-' + coord_list[0]
        coord_list.pop()  # remove the hemisphere indicator
    coordinate = 0
    denominator = 1
    for i in range(len(coord_list)):
        # DMS to decimal formula: deg = D + M/60 + S/3600
        coordinate += float(coord_list[i]) / denominator
        denominator *= 60
    if abs(coordinate) > 180:
        return 0
    return coordinate
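# Worked example of the DMS-to-decimal formula used above (deg = D + M/60 + S/3600),
# with a made-up northern-hemisphere coordinate:
#
#     convert_lat_lon_strings("34 12 30 N")
#     # -> 34 + 12/60 + 30/3600 = 34.2083...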
import os
import nltk
import codecs
import collections
import hashlib
from nltk import sent_tokenize
from string import punctuation
import multiprocessing as mp
from multiprocessing import pool
from itertools import izip_longest

global del_words, punct, default_stopwords, cores, block, swear_words
del_words = '@'
punct = punctuation.replace('#', '').replace('\\', '')
default_stopwords = nltk.corpus.stopwords.words('german')
cores = 48
block = 10000
swear_words = []


def grouper(n, iterable, padvalue=None):
    return izip_longest(fillvalue=padvalue, *[iter(iterable)] * n)


def removeSpecialCharsWorker(line):
    if line:
        line = line.rstrip()
        if 'http' not in line:
            translated_phrase = line.encode('utf-8').translate(None, punct)
            words_list = [word for word in translated_phrase.split()
                          if (not word.startswith(del_words))
                          and (not word.isdigit())
                          and (word not in default_stopwords)]
from string import punctuation
from sys import argv

# Create a list of all punctuation minus the hyphen
punctuationList = list(punctuation.replace("-", ""))


def main():
    # Get the name of the script and the name of the file from the command line
    script, filename = argv
    text = open(filename)
    contents = text.read()
    text.close()
    # Remove newline characters and separator hyphens from the file contents,
    # and split the contents into a list of all the words in it
    wordList = contents.replace("\n", " ").replace(" - ", " ").replace("--", " ").split(" ")
    # Create a dictionary with each of the words and the number of times that word occurs
    wordDict = {}
    for word in wordList:
        # Remove any punctuation from the file contents that doesn't act as a
        # separator for two different words
        for char in punctuationList:
            word = word.replace(char, "")
        # Don't count words that are blank strings, such as empty lines in a file
        if word != "":
            word = word.lower()
            if word in wordDict:
                # Assumed completion -- the original snippet is truncated here:
                wordDict[word] += 1
            else:
                wordDict[word] = 1
#!usr/bin/python
import nltk
import json
import re
from string import punctuation
from collections import Counter

# TODO: Pick best name

# Globals
award_specific_stopwords = ['rt', 'golden', 'globes', 'goldenglobes', 'best',
                            'director', 'actor', 'actress', 'movie', 'motion',
                            'picture', 'film', 'tv', 'series', 'performance',
                            'television', 'snub', 'wins', 'win', 'congrats',
                            'congratulations', 'season', 'animated', 'animation',
                            'feature', 'comedy', 'drama', 'musical', 'screenplay',
                            'award', 'awards']
stopwords = nltk.corpus.stopwords.words() + award_specific_stopwords
names = nltk.corpus.names.words()
# str.replace returns a new string, so the result must be reassigned for
# '#' and '@' to actually be dropped from the punctuation set.
punctuation = punctuation.replace('#', '').replace('@', '')


def query_name(tweets, pattern, g=None, n=None):
    """
    Queries the tweets of a specific year to get all sufficient ngrams.

    tweets is a dictionary object with the loaded json data from the tweets.
    pattern is a regex pattern you want to query. Pass in a compiled regex
    pattern for improved performance.
    g specifies which type of n-gram you want, i.e. g=1 for unigrams, 2 for
    bigrams, and 3 for trigrams. If g is not provided, the query returns all
    types of grams.
    n is the number of results you want to see. If not provided, it shows all.
    """
def clean_filename(filename):
    remove_punct_map = dict.fromkeys(map(ord, punctuation.replace('_', '') + '’'))
    return str(filename).translate(remove_punct_map).strip().replace(' ', '_')[:100]
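# Small usage sketch for clean_filename() above (assumes `punctuation` is
# imported from the string module); output traced by hand:
#
#     clean_filename("Chapter 1: It’s a Start!")  # -> 'Chapter_1_Its_a_Start'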
import db_info
import MySQLdb
import collections
from string import punctuation

db = MySQLdb.connect(host=db_info.host,      # your host, usually localhost
                     user=db_info.user,      # your username
                     passwd=db_info.passwd,  # your password
                     db=db_info.db)          # name of the data base

cur = db.cursor()  # cursor object for mysql query
twitter_punct = punctuation.replace("#", "")


# calculate the number of tweets for each hour and write to a .csv
def tweets_vs_time(cur):
    f = open("data/data.csv", "w")  # create new csv
    f.write("time,count\n")
    for i in range(15, 23):
        for j in range(0, 24):
            dateStart = "2013-04-%02d %02d" % (i, j)
            print "query: select count(*) from tweets where time like '" + dateStart + "%'"
            cur.execute("select count(*) from tweets where time like '" + dateStart + "%'")
            for row in cur.fetchall():
                dataset.append({"time": dateStart, "count": row})
                result = (str(dateStart) + "," + "%s\n") % (row[0],)
                f.write(result)
    print "done"
""" config.py Useful definitions and regular expressions. """ from string import punctuation CORPUS_FOLDER = "data/corpus" THESAURI_FOLDER = "data/thesaurus" MAPPING_FOLDER = "data/mapping" CORP_TAG = ".txt" THES_TAG = ".thes" MAP_TAG = ".map" WORD = "\w+[\'-]?\w*" PRICE = "\$[\d.]+" PUNCTUATION_EXCEPT_HYPHEN = '[' + punctuation.replace('-', '') + ']'
from nltk.stem.porter import *
from nltk.corpus import stopwords
from string import punctuation

stopwords_list = stopwords.words('english')
stemmer = PorterStemmer()
twitter_punctuation = punctuation.replace('#', '')  # we want to keep hashtags


def process_tweet(tweet):
    # remove all punctuation except the pound symbol
    tweet = tweet.translate(tweet.maketrans('', '', twitter_punctuation))
    word_list = tweet.split(' ')
    clean_words = []
    for word in word_list:
        # membership must be checked against the stopword list, not the
        # imported `stopwords` corpus module
        if word not in stopwords_list:
            clean_words.append(stemmer.stem(word))
    return clean_words


def process_tweets(tweets):
    dataset = []
    for tweet in tweets:
        dataset.append(process_tweet(tweet))
    return dataset
# Author: Christopher Hench
# ==============================================================================
'''Working product to automatically generate SRT file from script. Intervals
are given as user inputs while watching video. Line breaks are determined by
a tree parsing algorithm.'''

import time
from nltk.parse import stanford
from nltk import sent_tokenize, Tree
from string import punctuation

punctuation = punctuation.replace("'", "")
line_limit = 34

with open("script_eng.txt") as f:
    raw_script = f.read()

# prepare stanford parser
parser = stanford.StanfordParser(
    path_to_jar="/Users/chench/Documents/stanford-parser-full-2015-12-09/stanford-parser.jar",
    path_to_models_jar="/Users/chench/Documents/stanford-parser-full-2015-12-09/stanford-parser-3.6.0-models.jar")

sentences = parser.raw_parse_sents(sent_tokenize(raw_script))


def get_all_nodes(parent):
    '''extracts all chunk and word relations'''
    for node in parent:
ap_mac_pattern = re.compile('.*Address: (.*?)\n', re.I | re.M | re.S)
channel_pattern = re.compile('.*Channel:? ?(\d\d?)', re.I | re.M | re.S)
strength_pattern = re.compile('.*Quality:?=? ?(\d+)\s*/?\s*(\d*)', re.I | re.M | re.S)
altstrength_pattern = re.compile('.*Signal level:?=? ?(\d\d*)', re.I | re.M | re.S)
signaldbm_pattern = re.compile('.*Signal level:?=? ?(-\d\d*)', re.I | re.M | re.S)
mode_pattern = re.compile('.*Mode:(.*?)\n', re.I | re.M | re.S)
freq_pattern = re.compile('.*Frequency:(.*?)\n', re.I | re.M | re.S)
ip_pattern = re.compile(r'inet [Aa]d?dr[^.]*:([^.]*\.[^.]*\.[^.]*\.[0-9]*)', re.S)
bssid_pattern = re.compile('.*Access Point: (([0-9A-Z]{2}:){5}[0-9A-Z]{2})', re.I | re.M | re.S)
wep_pattern = re.compile('.*Encryption key:(.*?)\n', re.I | re.M | re.S)
altwpa_pattern = re.compile('(wpa_ie)', re.I | re.M | re.S)
wpa1_pattern = re.compile('(WPA Version 1)', re.I | re.M | re.S)
wpa2_pattern = re.compile('(WPA2)', re.I | re.M | re.S)
auth_pattern = re.compile('.*wpa_state=(.*?)\n', re.I | re.M | re.S)

RALINK_DRIVER = 'ralink legacy'

blacklist_strict = punctuation.replace("-", "") + " "
blacklist_norm = ";`$!*|><&\\"
blank_trans = maketrans("", "")


def _sanitize_string(string):
    if string:
        return translate(str(string), blank_trans, blacklist_norm)
    else:
        return string


def _sanitize_string_strict(string):
    if string:
        return translate(str(string), blank_trans, blacklist_strict)
    else:
        return string


def SetDNS(dns1=None, dns2=None, dns3=None):
    """ Set the DNS of the system to the specified DNS servers.

    Opens up resolv.conf and writes in the nameservers.
    """
'''
EncryptLite( sTest )  = 'Y=REjqRp=KurB=cVRbgIelZhuS{bOMhDG=t==nW==ABqX'

XOREncrypt( sTest )   = '3c0d094c1e1d0c0f074f0a17031b01480303144f0210' +
                        '011c0a0c45031a0a1a4518040a48090d16164801030b41'

getRot13( sTest )     = 'Gur dhvpx oebja sbk whzcrq bire gur ynml qbt.'
'''

from string import punctuation, digits

from Collect.Cards import ShuffleAndCut
from String.Replace import getTextReversed
from String.Transform import TranslatorFactory

sSafe = punctuation.replace('\\', ' ') + digits

changePunct = TranslatorFactory(sSafe, getTextReversed(sSafe))


def DescendChars(sOrig, iOffset=False, bStepIncrement=False,
                 bBackStep=False, bBackwards=False):
    #
    # from Iter.AllVers import lMap, iRange
    #
    def getIncrement(i):
        return 0
    #