def myTokenizer(self, txt):
    """
    Construct a tokenizer based on regular expressions.
    @param txt: the words of the text as a string
    @rtype: list
    """
    return re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", txt)
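# Hypothetical usage sketch (not part of the original source): since myTokenizer is
# written as a method, the same pattern is exercised here directly with re.findall.
import re

sample = "Rock 'n' roll isn't dead - it's alive."
print(re.findall(r"\w+(?:[-']\w+)*|'|[-.(]+|\S\w*", sample))
# hyphenated/apostrophised words such as "isn't" stay single tokens;
# stray quotes and punctuation come out as separate tokens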
def findCurrency(text):
    """ Display information about the found strings """
    symbols = "$£eurospoundsdollars"
    # Iterate through each item in text and find all strings matching a regular
    # expression to find all amounts of money
    for i in text:
        matches = re.findall('((?:(?:\$|£)(?:\d+)(?:\.?\d*,?\d{1,3})(?:bn|m)?)|'
                             '(?:(?:\d+)(?:\.?,?\d)*(?:bn|m)?(?: ?euros?| ?dollars?| ?pounds?| ?p)))',
                             i, re.IGNORECASE)
        # If a match is found, check the currency and amount, print
        if matches:
            for m in matches:
                if re.search('\$|dollars?', m, re.IGNORECASE):
                    currency = "Dollar"
                if re.search('\£|pounds?|p', m, re.IGNORECASE):
                    currency = "Pound"
                if re.search('euros?', m, re.IGNORECASE):
                    currency = "Euro"
                amount = m.strip(symbols)
                print("Found a match!" + "\nCurrency:", currency, "\nAmount:",
                      amount, "\n")
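# Hypothetical usage sketch (not from the original source): findCurrency expects an
# iterable of strings, so it is called with a small list of sample sentences
# (assuming `re` is imported at module level).
sample_sentences = [
    "The firm raised $4.5bn last quarter.",
    "Tickets cost 20 pounds or about 23 euros.",
]
findCurrency(sample_sentences)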
def sentence2word(inputFile, outputFile):
    with open(inputFile) as dataFile:
        sentences = dataFile.read().splitlines()
    rows = []
    for sentence in sentences:
        rows.append([sentence, 'Sentence'])
        rows.append(['BOS', 'BOS'])
        # split sentence into words and punctuation
        words = re.findall(r"[\w']+|[().,!?;]", sentence)
        for word in words:
            rows.append([word, 'O'])
        rows.append(['EOS', 'EOS'])
    with open(outputFile, 'w') as w:
        writer = csv.writer(w)
        writer.writerows(rows)
    print('Done: sentence text to word+\'O\' csv')
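# Hypothetical usage sketch (not part of the original source), assuming `re` and
# `csv` are imported at module level; the file names below are placeholders.
with open('sentences.txt', 'w') as demo:
    demo.write("The cat sat on the mat.\nIt purred loudly!\n")
sentence2word('sentences.txt', 'words.csv')
# words.csv then holds one token per row, wrapped in BOS/EOS marker rows.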
def convert_emphesize(text, return_count=False):
    # find fully upper-case words of two or more letters (treated as emphasised)
    emphs = re.findall(r'\b[A-Z]{2,}\b', text)
    emphs = set(emphs)
    if return_count:
        return len(emphs)
    for emph_ in emphs:
        text = re.sub(r'\b' + emph_ + r'\b', emph_ + ' emphh', text)
    return text
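# Hypothetical usage sketch (not part of the original source).
print(convert_emphesize("This is VERY IMPORTANT news"))
# expected: "This is VERY emphh IMPORTANT emphh news"
print(convert_emphesize("This is VERY IMPORTANT news", return_count=True))
# expected: 2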
def strip_url(text, return_count=False):
    # find http/https URLs
    urls = re.findall('http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
    if return_count:
        return len(urls)
    for url in urls:
        text = text.replace(url, '_URL_')
    text = text.replace('https:', '')
    return text
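# Hypothetical usage sketch (not part of the original source).
print(strip_url("see https://example.com/page?id=1 for details"))
# expected: "see _URL_ for details"
print(strip_url("see https://example.com/page?id=1 for details", return_count=True))
# expected: 1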
def is_long_number(text, threshold=1, flag_res=False):
    numbers_lens = re.findall('\\d+', text)
    if numbers_lens and len(max(numbers_lens, key=len)) >= threshold:
        if flag_res:
            return len(max(numbers_lens, key=len))
        return text + ' _longnumber_'
    if flag_res:
        return 0
    return text
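# Hypothetical usage sketch (not part of the original source): a digit run at least
# `threshold` characters long triggers the _longnumber_ marker.
print(is_long_number("call me on 07700900123", threshold=6))
# expected: "call me on 07700900123 _longnumber_"
print(is_long_number("room 12", threshold=6, flag_res=True))
# expected: 0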
def tokenize(str):
    # remove punctuation
    tokens = re.findall(r"<a.*?/a>|<[^\>]*>|[\w'@#]+", str.lower())
    # lemmatize words. try both noun and verb lemmatizations
    lmtzr = WordNetLemmatizer()
    for i in range(0, len(tokens)):
        res = lmtzr.lemmatize(tokens[i])
        if res == tokens[i]:
            tokens[i] = lmtzr.lemmatize(tokens[i], 'v')
        else:
            tokens[i] = res
    return tokens
def get_score(self, text):
    if self.language == 'es':
        sentences = len(self.sentence_tokenizer.tokenize(text))
        clean_text = self.clean_text(text)
        words = len(clean_text.split())
        syllables = len(re.findall('[aeiou]', clean_text))
        score = 206.835 - (62.3 * (syllables / words)) - (words / sentences)
        return score
    elif self.language == 'en':
        total_words = len(text.split())
        characters = len("".join(self.clean_text(text).split()))
        sentences = len(self.sentence_tokenizer.tokenize(text))
        score = round((4.71 * (characters / total_words)) +
                      (0.5 * (total_words / sentences)) - 21.43)
        return score
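# Added note (not from the original source): the Spanish branch appears to follow the
# Szigriszt-Pazos clarity formula and the English branch the Automated Readability
# Index (ARI). A minimal standalone sketch of the English branch, using crude
# sentence/character counts instead of the class's sentence_tokenizer and clean_text:
def ari_sketch(text):
    # hypothetical helper, not part of the original class
    sentences = max(1, text.count('.'))
    words = text.split()
    characters = sum(len(w.strip('.,!?')) for w in words)
    return round(4.71 * (characters / len(words)) +
                 0.5 * (len(words) / sentences) - 21.43)

print(ari_sketch("The cat sat on the mat. It was warm."))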
def stemming(string, top):
    string_rep = re.sub('[^a-zA-Z]', ' ', string)
    string_rep = string_rep.lower()
    new_vocab = word_tokenize(string_rep)
    #inter_stem=[ps.stem(l) for l in new_vocab]
    vocab_stem = [
        ps.stem(w) for w in new_vocab
        if w not in set(stopwords.words('english'))
    ]
    dictry_stem = set(vocab_stem)
    data = ' '.join(vocab_stem)
    arr = []
    c = 0
    t = 0
    for i in dictry_stem:
        c = len(re.findall(i, data))
        t = t + c
        arr.append([i, c])
        #print(i,c)
    aux = []
    for q, w in arr:
        if w > 1:
            aux.append([q, w])
    aux.sort(key=lambda i: i[1], reverse=True)
    aux = [w for w in aux if w[1] > 1]
    x = []
    y = []
    for wrd, co in aux:
        x.append(wrd)
        y.append(co)
    print("total number of words:", t, "\t set of words:", len(set(dictry_stem)))
    #print("Top ",top," words occurring more than twice: ",aux)
    print("%age of total length these word account for:", (sum(y) / t) * 100)
    #plt.bar(x,y,color='red',alpha=0.8)
    #plt.xlabel('Word')
    #plt.ylabel('Frequency')
    #plt.title('Words occurring more than twice')
    return (x, string_rep)
s2 = '</s>'
Know = (vocabRaw_tokens_nopunct[0] + vocabRaw_tokens_nopunct[1] +
        vocabRaw_tokens_nopunct[2])
vocabRaw_tokens_nopunct.append("[^" + Know + "]")
vocabRaw_tokens_nopunct.append(s1)

Px_a = [0.0, 0.0, 0.0, 0.0, 0.0]
Px_b = [0.0, 0.0, 0.0, 0.0, 0.0]
Px_c = [0.0, 0.0, 0.0, 0.0, 0.0]
Px_UNK = [0.0, 0.0, 0.0, 0.0, 0.0]
Px_s = [0.0, 0.0, 0.0, 0.0, 0.0]
Ps_x = [0.0, 0.0, 0.0, 0.0, 0.0]

for i in range(0, 5):
    Px_a[i] = len(re.findall(
        s1 + ".*" + vocabRaw_tokens_nopunct[0] + " " +
        vocabRaw_tokens_nopunct[i] + ".*" + s2, dataRaw)) / len(
            re.findall(vocabRaw_tokens_nopunct[0], dataRaw))
    Px_b[i] = len(re.findall(
        s1 + ".*" + vocabRaw_tokens_nopunct[1] + " " +
        vocabRaw_tokens_nopunct[i] + ".*" + s2, dataRaw)) / len(
            re.findall(vocabRaw_tokens_nopunct[1], dataRaw))
    Px_c[i] = len(re.findall(
        s1 + ".*" + vocabRaw_tokens_nopunct[2] + " " +
        vocabRaw_tokens_nopunct[i] + ".*" + s2, dataRaw)) / len(
            re.findall(vocabRaw_tokens_nopunct[2], dataRaw))
    Px_UNK[i] = len(re.findall(
        s1 + ".*" + vocabRaw_tokens_nopunct[3] + " " +
        vocabRaw_tokens_nopunct[i] + ".*" + s2, dataRaw)) / len(
            re.findall(vocabRaw_tokens_nopunct[3], dataRaw))
def horus_to_features(horusfile, le):
    print(horusfile)
    features, sentence_shape = [], []
    targets, tokens_shape, y_sentences_shape, y_tokens_shape = [], [], [], []
    df = pd.read_csv(horusfile, delimiter=",", skiprows=1, header=None,
                     keep_default_na=False, na_values=['_|_'])
    oldsentid = df.get_values()[0][1]
    for index, linha in df.iterrows():
        if len(linha) > 0:
            if linha[7] == 0:  # no compounds
                if linha[1] != oldsentid:
                    sentence_shape.append(features)
                    y_sentences_shape.append(targets)
                    targets, features = [], []
                idsent = linha[1]
                idtoken = linha[2]
                pos_bef = ''
                pos_aft = ''
                if index > 0 and df.get_value(index - 1, 7) == 0:
                    pos_bef = df.get_value(index - 1, 5)
                if index + 1 < len(df) and df.get_value(index + 1, 7) == 0:
                    pos_aft = df.get_value(index + 1, 5)
                token = linha[3]
                postag = linha[5]
                one_char_token = len(token) == 1
                special_char = len(
                    re.findall('(http://\S+|\S*[^\w\s]\S*)', token)) > 0
                first_capitalized = token[0].isupper()
                capitalized = token.isupper()
                title = token.istitle()
                digit = token.isdigit()
                stop_words = token in stop
                small = True if len(horusfile[3]) <= 2 else False
                stemmer_lanc = lancaster_stemmer.stem(token)
                nr_images_returned = linha[17]
                nr_websites_returned = linha[25]
                hyphen = '-' in token
                cv_loc = float(linha[12])
                cv_org = float(linha[13])
                cv_per = float(linha[14])
                cv_dist = float(linha[15])
                cv_plc = float(linha[16])
                tx_loc = float(linha[20])
                tx_org = float(linha[21])
                tx_per = float(linha[22])
                tx_err = float(linha[23])
                tx_dist = float(linha[24])
                if linha[6] in definitions.NER_TAGS_LOC:
                    ner = u'LOC'
                elif linha[6] in definitions.NER_TAGS_ORG:
                    ner = u'ORG'
                elif linha[6] in definitions.NER_TAGS_PER:
                    ner = u'PER'
                else:
                    ner = u'O'
                # standard shape
                sel_features = [
                    idsent, idtoken, token, token.lower(), stemmer_lanc,
                    pos_bef, postag, pos_aft, definitions.KLASSES2[ner],
                    le.transform(pos_bef), le.transform(postag),
                    le.transform(pos_aft), title, digit, one_char_token,
                    special_char, first_capitalized, hyphen, capitalized,
                    stop_words, small, nr_images_returned,
                    nr_websites_returned, cv_org, cv_loc, cv_per, cv_dist,
                    cv_plc, tx_org, tx_loc, tx_per, tx_dist, tx_err
                ]
                features.append(sel_features)
                if linha[51] in definitions.NER_TAGS_LOC:
                    y = u'LOC'
                elif linha[51] in definitions.NER_TAGS_ORG:
                    y = u'ORG'
                elif linha[51] in definitions.NER_TAGS_PER:
                    y = u'PER'
                else:
                    y = u'O'
                targets.append(y)
                tokens_shape.append(sel_features[9:len(sel_features)])
                y_tokens_shape.append(definitions.KLASSES2[y])
                oldsentid = linha[1]
    print('total of sentences', len(sentence_shape))
    print('total of tokens', len(tokens_shape))
    return sentence_shape, y_sentences_shape, tokens_shape, y_tokens_shape
def extract_named_entity(named_entities_tree, entity_type):
    return map(
        lambda result: ' '.join(
            map(lambda inner_result: inner_result.split('/')[0],
                result[5:][:-1].split())),
        re.findall('\({}.*\)'.format(entity_type), named_entities_tree))
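# Hypothetical usage sketch (not from the original source): the input string mimics a
# pretty-printed NLTK chunk tree with word/POS leaves, one entity per line so the
# greedy ".*" does not cross entities (assuming `re` is imported at module level).
tree = ("(S\n"
        "  (GPE London/NNP)\n"
        "  is/VBZ the/DT capital/NN of/IN\n"
        "  (GPE England/NNP))")
print(list(extract_named_entity(tree, 'GPE')))
# expected: ['London', 'England']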
# 100 and returned to string format. After the validation the output is provided saying a match is found,
# what the match is, the currency type and the quantity.
#-----------------------------------------------------------------------------------------------------------
import urllib
import nltk
from nltk import re

# Prints presentation text.
print("=" * 80 + "\n PART 2 : REGULAR EXPRESSIONS, FSAs and FSTs \n" + "=" * 80)

# Add the website source and scrape the contents of the <p> tags.
print("Loading website 'http://www.bbc.co.uk/news/business-41779341'...")
website = urllib.urlopen('http://www.bbc.co.uk/news/business-41779341').read()
print("Finding text between <p></p> tags...")
pTagText = re.findall('<p>(.*?)</p>', website, flags=re.I)

# The regular expression.
regular_expression = '(?:([€$£])((?:\d\d{0,2})(?:,\d{3})*(?:\.\d+)?(?:k|mn|bn|tn)?))|(?:((?:\d\d{0,2})(?:,\d{3})*(?:\.\d+)?(?:k|mn|bn|tn)?)( ?pence|p| ?euros?| ?dollars?| ?pounds?))'

# Iterates through the text gained from the <p> tags and applies the regular expression on them to search
# for matches, which if found, are added to the results array.
results = []
print("Finding currency related text from website contents...\n")
for p in pTagText:
    results += re.findall(regular_expression, p, flags=re.I)

# If no results are found print so.
if len(results) == 0:
    print("No matches found.")
import nltk
from nltk import word_tokenize, re, pos_tag
from urllib import request

# We ask the user whether they want to search phone numbers in text or in a URL.
choose = input(
    "Please let us know, if you want to search phone numbers from text or URL?"
)
if choose == 'text':
    # If the choice is 'text' we read text from the user, and store the regular expression pattern in reg
    text = input("Please Enter a text:")
    reg = '(((^)|(\s))((\+\d{2}\s\d{2}\s{0,1}\d{8})|([1-9]\d{3}\s\d{6})|((0)((0\d{2}\s\d{2}\s{0,1}\d{8})|([1-9]\d{3}\s{0,1}\d{6})|(\d{2}\s\d{2}\s{0,1}\d{8})))))'
    phone = re.findall(reg, text)
    print(text)
    instate = 'q0'
    if not phone:
        print("Sorry No Matches Found")
    else:
        for i in range(len(phone)):
            print("Match Found:", phone[i][0], "at:", i)

# Getting phone numbers from a URL:
# If the user wishes to enter a URL, we ask for it
elif choose == "URL":
    url = input("Please Enter the URL in full standard format:")
    # We use urlopen to open that URL, and we decode and store the URL value in rw in html format
    response = request.urlopen(url)
def tokens(text):
    return re.findall('[a-z]+', text.lower())
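# Hypothetical usage sketch (not part of the original source), assuming `re` is
# imported at module level.
print(tokens("Python 3.8 was released in 2019!"))
# expected: ['python', 'was', 'released', 'in']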
f=open("SimLex999-100.txt","r") lines=f.readlines() #lines=str(lines) result=[] from nltk import re lis1=[] for i in range(0,len(lines)) : lis1=lis1+lines[i].split('\t') lis1 strtry=str(lis1) strtry=" ".join(re.findall("[a-zA-Z]{2,}", strtry)) #print(strtry) las=strtry.split(' ') las sent1 = list() sent2 = list() index = 0 for letter in las: if index % 2 != 0: sent2.append(letter) else: sent1.append(letter) index += 1 from nltk.corpus import wordnet syn_name1 = list() for i in range(0,len(sent1)): syn = wordnet.synsets(sent1[i])[0] syn_name1.append(syn.name()) syn_name1 from nltk.corpus import wordnet syn_name2 = list()