def sentiment_analysis(message):
    """Return the summed sentiment score of the scorable sentences in *message*.

    The message is cleaned (user names, `` RT``/`` rt`` markers, URLs, ``#``
    signs and non-printable characters are removed, HTML entities are
    unescaped) and split into sentences.  Sentences that contain ``?`` or
    whose ``mood`` is ``'conditional'`` are skipped; the rest are POS-tagged
    with ``nltk.pos_tag``, passed through ``dictionary_tag`` and scored with
    ``sentiment_score``.

    :param message: raw message text
    :return: sum of the per-sentence scores (``0.0`` when nothing is scored)
    """
    # filter usernames, retweet markers and urls
    message = re.sub(r"(@[A-Za-z0-9]+)|( RT)|( rt)|(\w+:\/\/\S+)", " ",
                     message).strip()
    message = re.sub('#', "", message)
    # filter non printable characters (join keeps the result a string on
    # Python 3, where filter() returns an iterator)
    message = "".join(ch for ch in message if ch in string.printable)
    # unescape html
    message = HTMLParser.HTMLParser().unescape(message)

    # NOTE(review): the keyword is spelled ``puctuation`` (not ``punctuation``).
    # It is kept as-is because correcting it would change how the text is
    # tokenized -- confirm the intended behaviour before renaming it.
    tokenized = tokenize(message, puctuation='.!?:')

    # Keep only non-empty sentences that are neither questions nor conditional.
    tok1 = []
    for it in tokenized:
        if not it:
            continue
        mod = mood(it)
        if '?' in it or mod == 'conditional':
            continue
        tok1.append(it.strip())

    # POS-tag each sentence and turn the (word, tag) tuples into mutable lists.
    possed = [nltk.pos_tag(sentence.split(' ')) for sentence in tok1]
    final = [[list(entry) for entry in sentence] for sentence in possed]

    score = 0.0
    for sentence in final:
        score = score + sentiment_score(dictionary_tag(sentence))
    return score
def __iter__(self): if os.path.isdir(self.fname): filenames = [ os.path.join(self.fname, f) for f in os.listdir(self.fname) ] else: filenames = [self.fname] for filename in filenames: with io.open(filename, encoding='utf-8') as f: squad = json.load(f) print "Loaded data of len", len(squad['data']) for d in squad['data']: if self.mode == "squad": yield [self.begin] + list(d["sentence"]) + [ self.middle ] + list(d["question"]) + [self.end], list( d["answer"]) + [self.end] elif self.mode == "squad_word": yield [self.begin ] + tokenize(d["sentence"])[0].split(" ") + [ self.middle ] + tokenize(d["question"])[0].split(" ") + [ self.end ], tokenize( d["answer"])[0].split(" ") + [self.end] elif self.mode == "squad_ptr": yield [self.begin] + list(d["sentence"]) + [ self.middle ] + list(d["question"]) + [self.end], list( d["answer"]) + [self.end]
def opinioncheck(line):
    """Record *line* as a positive and/or negative opinion.

    The line is tokenized twice (the result of the first ``tokenize`` call is
    tokenized again) and split on whitespace.  For every word found in
    ``poslist`` the whole line is appended to ``posop``; for every word found
    in ``neglist`` it is appended to ``negop``.  A line is therefore appended
    once per matching word.
    """
    words = (word
             for sentence in tokenize(line)
             for chunk in tokenize(sentence)
             for word in chunk.split())
    for word in words:
        if word in poslist:
            posop.append(line)
        if word in neglist:
            negop.append(line)
def __iter__(self):
    """Yield token sequences built from the text files under ``self.fname``.

    ``self.fname`` may be a file or a directory.  Every non-empty line is
    lower-cased, reduced to ASCII letters and spaces, tokenized and followed
    by a ``<brN>`` marker, where N is the line's index in the file.  What is
    yielded depends on ``self.mode``:

    * ``"oedilf"`` -- one list per file: begin marker, all line tokens, end
      marker;
    * ``"oedilf_rhymes"`` -- same, but only the last token of each line is
      kept;
    * ``"oedilf_s2s"`` -- one ``(history, line)`` pair per line, where the
      history holds the tokens of all previous lines of the file.
    """
    letters = "qwertyuioplkjhgfdsazxcvbnm "

    def _line_tokens(index, text, last_only=False):
        # Clean one line and return its non-empty tokens plus the <brN> marker.
        kept = ''.join([ch for ch in text.lower() if ch in letters])
        pieces = ' '.join(tokenize(kept)).split(" ")
        if last_only:
            pieces = pieces[-1:]
        pieces = pieces + ['<br' + str(index) + '>']
        return [piece for piece in pieces if piece != '']

    if os.path.isdir(self.fname):
        paths = []
        for entry in os.listdir(self.fname):
            paths.append(os.path.join(self.fname, entry))
    else:
        paths = [self.fname]

    for path in paths:
        with open(path) as handle:
            doc = handle.read()
            if self.mode == "oedilf":
                toks = [self.begin]
                for i, line in enumerate(doc.split("\n")):
                    if line:
                        toks += _line_tokens(i, line)
                yield toks + [self.end]
            if self.mode == "oedilf_rhymes":
                toks = [self.begin]
                for i, line in enumerate(doc.split("\n")):
                    if line:
                        toks += _line_tokens(i, line, last_only=True)
                yield toks + [self.end]
            if self.mode == "oedilf_s2s":
                history = []
                for i, line in enumerate(doc.split("\n")):
                    if not line:
                        continue
                    current = _line_tokens(i, line)
                    yield ([self.begin] + history + [self.end],
                           current + [self.end])
                    history += current
def test_tokenize(self):
    """Check that ``en.tokenize`` splits a text into two tokenized sentences."""
    # The tokenizer should at least handle common abbreviations and
    # punctuation, and return one string per sentence.
    expected = ["The cat is eating ( e.g. , a fish ) .", "Yum !"]
    actual = en.tokenize("The cat is eating (e.g., a fish). Yum!")
    self.assertEqual(actual, expected)
    print("pattern.en.tokenize()")
def form_sentences(self, text_block, block_id, remove_stopwords=False, stem=True, form_tagged_doc=True):
    """
    Parse a block of text and form a list of word-tokenized sentences.

    :param text_block: single block of text as string
    :param block_id: id of the text block, used to build the document tags
    :param remove_stopwords: passed on to the word filter
    :param stem: apply ``self.stemmer`` to every kept word
    :param form_tagged_doc: wrap every sentence in a TaggedDocument
        (tag ``"<block_id> <sentence index>"``) and register it in
        ``self.doc_tags``
    :return: list of word lists, or list of TaggedDocument objects when
        ``form_tagged_doc`` is true
    """
    def _normalise(word):
        return self.stemmer(word) if stem else word

    # Pass 1: sentence split on the lower-cased text, then strip or replace
    # characters that are not wanted in the sentences.
    substitutions = (('\'', ''), ('(', ' '), (')', ' '), ("/", " or "), ("-", ""))
    stripped = []
    for raw in pattern.tokenize(text_block.lower()):
        for old, new in substitutions:
            raw = raw.replace(old, new)
        stripped.append(raw)

    # Pass 2: drop TAG_RE matches and apply the configured sentence function.
    prepared = []
    for sentence in stripped:
        prepared.append(self.sentence_func(TAG_RE.sub('', sentence)))

    # Pass 3: word tokenize, filter and optionally stem.
    word_lists = []
    for sentence in prepared:
        kept = [w for w in word_tokenize(sentence)
                if self.__word_filter(w, remove_stopwords)]
        word_lists.append([_normalise(w) for w in kept])

    if not form_tagged_doc:
        return word_lists

    tagged = []
    for index, words in enumerate(word_lists):
        tag_name = str(block_id) + ' ' + str(index)
        tagged.append(TaggedDocument(words=words, tags=[tag_name]))
    for document in tagged:
        self.doc_tags[document.tags[0]] = document
    return tagged
def word_ranking(text, n='L2'):
    """
    Score the terms of *text* with an LSA-based "cross method".

    steps:
        1. tokenize text by sentences
        2. build a TF-IDF model with one document per sentence
        3. reduce the model with LSA
        4. zero every cell of the concept matrix that does not exceed its
           row average, weight the rows by the singular values and sum the
           columns
           (source: http://www.aclweb.org/anthology/C10-1098.pdf)

    - text: string consisting of a few sentences
    - n: value passed as ``dimensions`` to ``Model.reduce``

    Returns a Counter mapping every LSA term to its score.
    """
    # one document per sentence, named after the sentence index
    docs = []
    for index, sentence in enumerate(tokenize(text)):
        docs.append(Document(sentence, name=index))

    model = Model(docs, weight=TFIDF)
    model.reduce(dimensions=n)

    # cross-method, source:
    # http://www.ceng.metu.edu.tr/~e1395383/papers/TextSummarizationUsingLSA(Journal).pdf
    concept_matrix = np.array(model.lsa.vt)
    # average score of each row of the Vt matrix, as a column vector
    row_average = np.mean(concept_matrix, axis=1).reshape((-1, 1))
    # cells that are less than or equal to their row average are set to zero
    concept_matrix[concept_matrix <= row_average] = 0.0
    # singular values as a column vector, used to weight the rows
    sigma = np.array(model.lsa.sigma).reshape((-1, 1))
    # column-wise sum of the weighted matrix: one score per term
    scores = np.sum(concept_matrix * sigma, axis=0)
    return Counter(dict(zip(model.lsa.terms, scores)))
def sentence_walk(self):
    """Pretty-print the sentences of ``self.source_text`` that contain an "artifact" word.

    The words come from ``search.hypernym_search(self.source_text,
    "artifact")``.  A sentence is kept when one of those words occurs in it
    with a space on both sides.  Nothing is returned; the result is printed
    with ``pprint``.
    """
    sents = tokenize(self.source_text)
    words = set(search.hypernym_search(self.source_text, "artifact"))
    if words:
        # The alternation is grouped so that the surrounding spaces apply to
        # every word (" a|b|c " would match " a", "b" or "c "), and each word
        # is escaped so regex metacharacters are matched literally.
        pat = re.compile(" (?:" + "|".join(re.escape(w) for w in words) + ") ")
        sents = [s for s in sents if pat.search(s) is not None]
    else:
        # Without any word nothing can match.
        sents = []
    pprint(sents)
def dispersion(text, keywords):
    """
    Dispersion of occurrence of given keywords among given text.

    The text is tokenized into sentences, lower-cased and split on
    whitespace.  Every position where a token equals a keyword is recorded.

    - text: string
    - keywords: list of keywords (compared against lower-cased tokens)

    Returns two parallel tuples ``(x, y)``: ``x`` holds token positions and
    ``y`` the index of the matching keyword.  Both tuples are empty when no
    keyword occurs in the text.
    """
    # tokenize text to sentences, then to lower-cased words
    tokens = [w for sent in tokenize(text) for w in sent.lower().split()]

    disp = [(x, y)
            for x, token in enumerate(tokens)
            for y, keyword in enumerate(keywords)
            if token == keyword]
    if not disp:
        # zip(*[]) yields nothing, so unpacking it would raise ValueError.
        return (), ()
    x, y = zip(*disp)
    return x, y
def summarize(text_to_summarize):
    """Return the two highest-ranked sentences of the text, in original order.

    Sentences with more than seven space-separated tokens become documents.
    Two documents are linked when the Jaccard distance of their keyword
    lists is below 1; the link weight is ``1 - distance``.  The sentences are
    ranked with PageRank over the resulting directed graph.

    :param text_to_summarize: text to summarize
    :return: the selected sentences joined with single spaces
    """
    stokens = tokenize(text_to_summarize)

    # STEP 1
    # pattern.vector's Document is a nifty bag-o-words structure,
    # with a TF weighting scheme
    docs = [Document(string=s, name=e, stemmer=LEMMA)
            for e, s in enumerate(stokens) if len(s.split(" ")) > 7]

    # Keyword lists are computed once per document instead of once per pair.
    keyword_lists = [[kw[1] for kw in doc.keywords()] for doc in docs]

    # STEP 2 and 3: pairwise Jaccard distance and link graph
    linkgraph = []
    for doc, wordset_a in zip(docs, keyword_lists):
        for doc_copy, wordset_b in zip(docs, keyword_lists):
            if doc.name == doc_copy.name:
                continue
            jacc_dist = distance.jaccard(wordset_a, wordset_b)
            if jacc_dist < 1:
                linkgraph.append((str(doc.name),       # index to sentence
                                  str(doc_copy.name),
                                  1 - jacc_dist))      # dist. score

    # STEP 4
    # I referenced this SO post for help with pagerank'ing
    # http://stackoverflow.com/questions/9136539/how-to-weighted-edges-affect-pagerank-in-networkx
    D = nx.DiGraph()
    D.add_weighted_edges_from(linkgraph)
    pagerank = nx.pagerank(D)

    # Ascending sort followed by reverse() is kept on purpose: it orders
    # ties differently from sorted(..., reverse=True).
    sort_pagerank = sorted(pagerank.items(), key=operator.itemgetter(1))
    sort_pagerank.reverse()
    orderedtop2 = sorted(int(x[0]) for x in sort_pagerank[:2])
    return " ".join([stokens[i] for i in orderedtop2])
def summarize(text, n=2):
    """
    Determine the most informative sentences by summing the ranks of the
    words that occur in each sentence.

    - text: string consisting of a few sentences
    - n: number of sentences to return; it is also passed on to
      ``word_ranking``

    Returns the selected sentences joined with single spaces.
    """
    # tokenize text to sentences list
    sentences = tokenize(text)
    # tokenize sentence list by words
    words_sent = [sent.lower().split() for sent in sentences]
    # words ranking
    w_ranking = word_ranking(text, n)

    # sents ranking = sum of words score
    s_ranking = defaultdict(int)
    for i, sent in enumerate(words_sent):
        for word in sent:
            if word in w_ranking:
                s_ranking[i] += w_ranking[word]
    # placed sents ranking into high-performance container
    s_ranking = Counter(s_ranking)

    # get top n sents indexes with scores
    sents_idx = s_ranking.most_common(n)
    output = [sentences[j[0]] for j in sents_idx]

    # reordering by position in the original text; a key function gives the
    # same order as the old cmp function and also works on Python 3
    # NOTE(review): text.find() returns -1 for a sentence that does not
    # occur verbatim in the text -- confirm that tokenized sentences do.
    output.sort(key=text.find)
    return ' '.join(output)
def do_POST(self):
    """Handle ``POST /predict``: classify the ``text`` form field.

    Responds with 404 when the path is not ``/predict`` or the form has no
    ``text`` field.  Otherwise the text is tokenized, lower-cased, mapped to
    indexes through ``w2indx`` (unknown words map to 0), padded or truncated
    to 200 entries and passed to ``model.predict_classes``; the matching
    ``LBL`` entry is written back as UTF-8 plain text.

    :return: the HTTP status code that was sent (404 or 200)
    """
    form = cgi.FieldStorage(fp=self.rfile,
                            headers=self.headers,
                            environ={
                                'REQUEST_METHOD': 'POST',
                                'CONTENT_TYPE': self.headers['Content-Type'],
                            })
    if self.path != '/predict' or 'text' not in form.keys():
        self.send_response(404)
        self.end_headers()
        return 404

    self.send_response(200)
    self.send_header("Content-type", 'text/plain')
    self.end_headers()

    # Insert a space where a sentence end is glued to the next sentence
    # ("end.Next" -> "end. Next") so the tokenizer can split there.  Raw
    # strings avoid the invalid "\g" escape; count is passed by keyword.
    spaced = re.sub(r'([a-z][.!?]+)([A-Z])', r'\g<1> \g<2>',
                    form['text'].value, count=0)
    text = ' '.join(tokenize(spaced)).lower().split()

    x = [[w2indx.get(word, 0) for word in text]]
    x = sequence.pad_sequences(x,
                               maxlen=200,
                               padding='post',
                               truncating='post')
    predict = model.predict_classes(x)[0][0]
    self.wfile.write(bytes(LBL[predict], encoding='utf8'))
    return 200
def sentiment_analysis(message):
    """Return the summed sentiment score of the scorable sentences in *message*.

    The message is cleaned (user names, `` RT``/`` rt`` markers, URLs, ``#``
    signs and non-printable characters are removed, HTML entities are
    unescaped) and split into sentences.  Sentences that contain ``?`` or
    whose ``mood`` is ``'conditional'`` are skipped; the rest are POS-tagged
    with ``nltk.pos_tag``, passed through ``dictionary_tag`` and scored with
    ``sentiment_score``.

    :param message: raw message text
    :return: sum of the per-sentence scores (``0.0`` when nothing is scored)
    """
    # filter usernames, retweet markers and urls
    message = re.sub(r"(@[A-Za-z0-9]+)|( RT)|( rt)|(\w+:\/\/\S+)", " ",
                     message).strip()
    message = re.sub('#', "", message)
    # filter non printable characters (join keeps the result a string on
    # Python 3, where filter() returns an iterator)
    message = "".join(ch for ch in message if ch in string.printable)
    # unescape html
    message = HTMLParser.HTMLParser().unescape(message)

    # NOTE(review): the keyword is spelled ``puctuation`` (not ``punctuation``).
    # It is kept as-is because correcting it would change how the text is
    # tokenized -- confirm the intended behaviour before renaming it.
    tokenized = tokenize(message, puctuation='.!?:')

    # Keep only non-empty sentences that are neither questions nor conditional.
    tok1 = []
    for it in tokenized:
        if not it:
            continue
        mod = mood(it)
        if '?' in it or mod == 'conditional':
            continue
        tok1.append(it.strip())

    # POS-tag each sentence and turn the (word, tag) tuples into mutable lists.
    possed = [nltk.pos_tag(sentence.split(' ')) for sentence in tok1]
    final = [[list(entry) for entry in sentence] for sentence in possed]

    score = 0.0
    for sentence in final:
        score = score + sentiment_score(dictionary_tag(sentence))
    return score
def clean_text(text):
    """
    Split *text* into sentences and return a cleaned version of each one.

    Every sentence is split on whitespace, words found in ``punctuation``
    are dropped, the remaining words are stemmed with ``porter_stemmer`` and
    the re-joined sentence is passed through
    ``CleanTextProcessor.clean_not_words``.  A ``.`` is appended to every
    cleaned sentence.

    :param text: text as str
    :return: list of sentences (empty when the stripped text is empty)
    :raises Exception: wrapping the stack trace of any error raised while
        cleaning; the error is logged first
    """
    try:
        text = text.strip()
        if not text:
            return []
        final_sentences = []
        for sentence in tokenize(text):
            cleaned_tokens = [
                porter_stemmer.stem(word) for word in sentence.split()
                if word not in punctuation
            ]
            cleaned_sent = CleanTextProcessor.clean_not_words(
                " ".join(cleaned_tokens))
            final_sentences.append(cleaned_sent + ".")
        return final_sentences
    except Exception:
        # ``except Exception`` instead of a bare ``except`` so that
        # KeyboardInterrupt / SystemExit are not turned into a plain Exception.
        trace_err = StackTrace.get_stack_trace()
        msg = "CleanTextProcessor (clean_text()) : %s%s" % ("\n", trace_err)
        log.error(msg)
        raise Exception(msg)
def summarize(text, n=1):
    """
    Extract the most relevant sentences from text with TextRank.

    One document is built per sentence, a TF-IDF model is created from the
    documents and ``utils.textrank`` ranks them using the model's distance
    function.

    - text: string consisting of a few sentences
    - n: number of sentences to extract

    Returns the selected sentences in their original order, joined with
    single spaces (an empty string when nothing was ranked).
    """
    # tokenize text to sentences list
    sentences = tokenize(text)
    # create documents list, named after the sentence index
    docs = [Document(sentence, name=index)
            for index, sentence in enumerate(sentences)]
    # model initialize
    m = Model(docs, weight=TFIDF)
    # TextRank ranking computed from the model's documents and distance
    ranking = utils.textrank(m.documents, m.distance)

    # indexes of top n sentences
    top = ranking.most_common(n)
    if not top:
        # Unpacking zip(*[]) would raise ValueError.
        return ''
    top_sents_idx = [idx for idx, _ in top]

    # reordering; join with a space so consecutive sentences do not run
    # together
    output = [sentences[i] for i in sorted(top_sents_idx)]
    return ' '.join(output)
def keywords(text, n=15):
    """
    Extract the most relevant keywords from the given text.

    steps:
        1. tokenize text by sentences
        2. keep the lower-cased words tagged "JJ" or "NN"
        3. rank them with ``utils.textrank`` using ``utils.levenshtein``

    - text: string consisting of a few sentences
    - n: number of keywords to extract

    Returns two parallel tuples: the keywords and their scores.
    """
    # syntactic filter: retrieve all adjectives and nouns
    candidates = []
    for sentence in tokenize(text):
        candidates.extend(word.lower()
                          for word, pos in tag(sentence)
                          if pos in ("JJ", "NN"))

    # TextRank ranking based on the levenshtein distance function
    ranking = utils.textrank(candidates, utils.levenshtein)

    # top n keywords
    top_words, top_scores = list(zip(*ranking.most_common(n)))
    return top_words, top_scores
def sentence_walk(self):
    """Pretty-print the sentences of ``self.source_text`` that contain an 'artifact' word.

    The words come from ``search.hypernym_search(self.source_text,
    'artifact')``.  A sentence is kept when one of those words occurs in it
    with a space on both sides.  Nothing is returned; the result is printed
    with ``pprint``.
    """
    sents = tokenize(self.source_text)
    words = set(search.hypernym_search(self.source_text, 'artifact'))
    if words:
        # The alternation is grouped so that the surrounding spaces apply to
        # every word (' a|b|c ' would match ' a', 'b' or 'c '), and each word
        # is escaped so regex metacharacters are matched literally.
        pat = re.compile(' (?:' + '|'.join(re.escape(w) for w in words) + ') ')
        sents = [s for s in sents if pat.search(s) is not None]
    else:
        # Without any word nothing can match.
        sents = []
    pprint(sents)
def ngrams(text, n=1, lowercase=False):
    """Yield every word n-gram of size 1 up to *n* for each sentence of *text*.

    Sentences come from ``tokenize(text)`` and are split on whitespace.  For
    each sentence all 1-grams are yielded first, then all 2-grams, and so
    on.  N-grams never span two sentences.

    :param text: text to process
    :param n: largest n-gram size
    :param lowercase: lower-case each sentence before splitting
    """
    for sentence in tokenize(text):
        words = (sentence.lower() if lowercase else sentence).split()
        for extra in xrange(n):
            size = extra + 1
            for start in xrange(len(words) - extra):
                yield ' '.join(words[start:start + size])
def split_text_to_list_of_sentences(raw_text):
    """
    Split the raw text into a list of sentences.

    Thin wrapper around ``tokenize``.

    Args:
        raw_text (str): text input in paragraphs.

    Returns:
        (list): list of str of sentences.
    """
    sentences = tokenize(raw_text)
    return sentences
def key_sentences(self):
    """Pretty-print the sentences of ``self.source_text`` that contain an "instrumentality" word.

    The words come from ``search.hypernym_search(self.source_text,
    "instrumentality")``.  A sentence is kept when one of those words occurs
    in it with a space on both sides.  The kept sentences and the word set
    are printed with ``pprint``; nothing is returned.
    """
    words = set(search.hypernym_search(self.source_text, "instrumentality"))
    sents = tokenize(self.source_text)
    if words:
        # The alternation is grouped so that the surrounding spaces apply to
        # every word (" a|b|c " would match " a", "b" or "c "), and each word
        # is escaped so regex metacharacters are matched literally.
        pat = re.compile(" (?:" + "|".join(re.escape(w) for w in words) + ") ")
        sents = [s for s in sents if pat.search(s) is not None]
    else:
        # Without any word nothing can match.
        sents = []
    pprint(sents)
    pprint(words)
def test_findTonkens_3(): s = "I eat pizza with a fork." s = "Bachelor's degree in Computer Science or equivalent" s = "B.S. in Computer Science, a related degree or its equivalent " s = "What's this? This is a book." from pattern.en import tokenize result = tokenize(s) print result
def sentance_break(origin_text):
    """
    Break a text into sentences.

    Input: output text from gutenberg_text_gather
    Output: tokenized text, a list of strings where each string is a sentence
    """
    # use pattern to break the text apart into a list of sentence strings
    return tokenize(origin_text)
def form_sentences(self):
    """Group the sentences of the corpus file by the verbs they contain.

    ``CORPUS_FILE`` is read and sentence-tokenized.  ``self.sentences`` is
    rebuilt as a ``defaultdict(list)`` that maps every verb of ``VERBS`` to
    the sentences in which the verb occurs with a space on both sides.
    """
    # The context manager closes the file even when tokenizing fails.
    with open(CORPUS_FILE, "rbU") as f_p:
        corpus_sentences = pattern.tokenize(f_p.read())

    self.sentences = defaultdict(list)
    for sentence in corpus_sentences:
        for v in VERBS:
            if " " + v + " " in sentence:
                self.sentences[v].append(sentence)
def key_sentences(self):
    """Pretty-print the sentences of ``self.source_text`` that contain an 'instrumentality' word.

    The words come from ``search.hypernym_search(self.source_text,
    'instrumentality')``.  A sentence is kept when one of those words occurs
    in it with a space on both sides.  The kept sentences and the word set
    are printed with ``pprint``; nothing is returned.
    """
    words = set(search.hypernym_search(self.source_text, 'instrumentality'))
    sents = tokenize(self.source_text)
    if words:
        # The alternation is grouped so that the surrounding spaces apply to
        # every word (' a|b|c ' would match ' a', 'b' or 'c '), and each word
        # is escaped so regex metacharacters are matched literally.
        pat = re.compile(' (?:' + '|'.join(re.escape(w) for w in words) + ') ')
        sents = [s for s in sents if pat.search(s) is not None]
    else:
        # Without any word nothing can match.
        sents = []
    pprint(sents)
    pprint(words)
def __iter__(self):
    """Yield token sequences built from the text files under ``self.fname``.

    ``self.fname`` may be a file or a directory.  Depending on ``self.mode``:

    * ``"ohhla"`` -- one list per file: begin marker, the tokens of every
      non-empty line each followed by ``<br>``, end marker;
    * ``"ohhla_line_pairs"`` -- one ``(input, output)`` pair for every two
      consecutive lines (empty lines included): the input is the first line
      wrapped in begin/end markers, the output is the second line followed
      by the end marker.
    """
    if os.path.isdir(self.fname):
        paths = []
        for entry in os.listdir(self.fname):
            paths.append(os.path.join(self.fname, entry))
    else:
        paths = [self.fname]

    for path in paths:
        with open(path) as handle:
            doc = handle.read()
            if self.mode == "ohhla":
                toks = [self.begin]
                for line in doc.split("\n"):
                    if line:
                        toks += ' '.join(tokenize(line)).split(" ") + ['<br>']
                yield toks + [self.end]
            elif self.mode == "ohhla_line_pairs":
                tokenized_lines = [tokenize(line) for line in doc.split("\n")]
                for first, second in zip(tokenized_lines, tokenized_lines[1:]):
                    source = ([self.begin] + ' '.join(first).split(" ") +
                              [self.end])
                    target = ' '.join(second).split(" ") + [self.end]
                    yield (source, target)
def tokenize_pattern(text):
    """
    Return the lower-cased tokens of *text* as one flat list.

    ``tokenize()`` returns a list of sentences, with punctuation marks split
    from words; the sentences are split on whitespace here.
    """
    marks = ".,;:!?()[]{}`''\"@#$^&*+-|=~_«»…".decode("utf8")
    sents = tokenize(text, punctuation=marks, replace={})
    # tokenize() returns a list of sentences of the form (example translated
    # from Russian):
    #   Now , in 2014 , a naked Jennifer Lawrence appeared on the Internet a
    #   month and a half before the world premiere of the first part of the
    #   last installment of the trilogy « The Hunger Games : Mockingjay »
    #   ( The Hunger Games : Mockingjay – Part 1 ) .
    tokens = []
    for sent in sents:
        tokens.extend(token.lower() for token in sent.split())
    log.debug("Tokenize with Pattern")
    return tokens
def __call__(self, org_doc):
    """Replace every known phrase in *org_doc* with its mapped replacement.

    ``self.X`` maps phrases (keys) to replacements (values).  A phrase is
    considered when it occurs as a substring of the lower-cased document
    (tokens containing ``_`` are ignored for that check).  The document is
    tokenized into sentences and, inside each sentence, every run of tokens
    that matches the phrase case-insensitively is replaced by the
    replacement.  The processed sentences are joined with single spaces.

    :param org_doc: document text
    :return: the tokenized document with the phrases replaced
    """
    doc = org_doc
    tokens = doc.lower().split()
    ldoc = ' '.join([x for x in tokens if "_" not in x])
    # Identify which phrases were used
    keywords = [key for key in self.X if key in ldoc]
    punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~"
    # Loop over the keywords and replace them one-by-one.
    # This is inefficient, but less error prone.
    parsed_sent = []
    for sent in tokenize(doc, punctuation=punctuation):
        for word in keywords:
            word_n_tokens = len(word.split())
            new_word = self.X[word]
            word_tokens = word.split()
            # Check if the substring tokens match
            tokens = sent.lower().split()
            # presumably one boolean per token position, True where the
            # phrase starts -- verify against contains_sublist
            mask = contains_sublist(tokens, word_tokens)
            # Replace one occurrence per iteration and re-scan, because the
            # token positions shift after every replacement.
            while any(mask):
                idx = mask.index(True)
                sent = sent.split()
                args = sent[:idx] + [
                    new_word,
                ] + sent[idx + word_n_tokens:]
                sent = ' '.join(args)
                tokens = sent.lower().split()
                mask = contains_sublist(tokens, word_tokens)
        parsed_sent.append(sent)
    doc = ' '.join(parsed_sent)
    # The bare string below is disabled debugging code; it has no effect.
    """ # Change the punctuation to a more readable format for debugging punc_compress = ''').,?!':''' for punc in punc_compress: doc = doc.replace(' '+punc,punc) punc_compress = '''(''' for punc in punc_compress: doc = doc.replace(punc+' ',punc) """
    return doc
def __call__(self, org_doc):
    """Replace every known phrase in *org_doc* with its mapped replacement.

    ``self.X`` maps phrases (keys) to replacements (values).  A phrase is
    considered when it occurs as a substring of the lower-cased document
    (tokens containing ``_`` are ignored for that check).  The document is
    tokenized into sentences and, inside each sentence, every run of tokens
    that matches the phrase case-insensitively is replaced by the
    replacement.  The processed sentences are joined with single spaces.

    :param org_doc: document text
    :return: the tokenized document with the phrases replaced
    """
    doc = org_doc
    tokens = doc.lower().split()
    ldoc = ' '.join([x for x in tokens if "_" not in x])

    # Identify which phrases were used
    keywords = [key for key in self.X if key in ldoc]

    punctuation = ".,;:!?()[]{}`''\"@#$^&*+-|=~"

    # Loop over the keywords and replace them one-by-one.
    # This is inefficient, but less error prone.
    parsed_sent = []
    for sent in tokenize(doc, punctuation=punctuation):
        for word in keywords:
            new_word = self.X[word]
            # Split the phrase once; its length is the number of tokens
            # that a match replaces.
            word_tokens = word.split()
            word_n_tokens = len(word_tokens)

            # Check if the substring tokens match
            tokens = sent.lower().split()
            mask = contains_sublist(tokens, word_tokens)

            # Replace one occurrence per iteration and re-scan, because the
            # token positions shift after every replacement.
            while any(mask):
                idx = mask.index(True)
                sent = sent.split()
                args = sent[:idx] + [new_word] + sent[idx + word_n_tokens:]
                sent = ' '.join(args)
                tokens = sent.lower().split()
                mask = contains_sublist(tokens, word_tokens)

        parsed_sent.append(sent)

    doc = ' '.join(parsed_sent)

    # Disabled debugging aid, kept for reference: change the punctuation to
    # a more readable format.
    #   punc_compress = ''').,?!':'''
    #   for punc in punc_compress:
    #       doc = doc.replace(' '+punc,punc)
    #   punc_compress = '''('''
    #   for punc in punc_compress:
    #       doc = doc.replace(punc+' ',punc)

    return doc
def __call__(self, data):
    """Tokenize the sentences produced by the 'sentence_splitter' enrichment.

    Every paragraph with a non-empty ``'content'`` list is copied with each
    of its sentences tokenized; paragraphs without content are dropped.  The
    result is stored on *data* under ``self.name`` through
    ``self.add_enrichment``, whose return value is returned.
    """
    tokenized = []
    for paragraph in self.get_enrichment(data, 'sentence_splitter'):
        if 'content' not in paragraph or not paragraph['content']:
            continue
        # Tokenize the split sentences and join potential sentence splits
        # detected by pattern back into one string.
        joined = []
        for sentence in paragraph['content']:
            joined.append(' '.join(tokenize(sentence)))
        tokenized.append({'content': joined, 'type': paragraph['type']})
    return self.add_enrichment(data, self.name, tokenized)
def __iter__(self):
    """Yield the alphabetic words of every non-blank line under ``self.dirname``.

    All files below ``self.dirname`` are visited with ``os.walk``.  Each
    non-blank line is stripped, passed through ``cleanhtml``, tokenized,
    lower-cased and split on whitespace; only tokens for which
    ``str.isalpha`` is true are kept.  One list of words is yielded per
    line.
    """
    for root, dirs, files in os.walk(self.dirname):
        for filename in files:
            # Fixed: the original referenced the undefined name ``filenam``.
            file_path = root + '/' + filename
            # The context manager closes each file once it has been read.
            with open(file_path) as handle:
                for line in handle:
                    sline = line.strip()
                    if sline == "":
                        continue
                    rline = cleanhtml(sline)
                    tokenized_line = ' '.join(tokenize(rline))
                    yield [word
                           for word in tokenized_line.lower().split()
                           if word.isalpha()]
def __iter__(self):
    """Yield the alphabetic words of every non-blank line under ``self.dirname``.

    All files below ``self.dirname`` are visited with ``os.walk``.  Each
    non-blank line is stripped, passed through ``clean_html``, tokenized and
    segmented with ``jieba.cut`` (``cut_all=False``); only segments for
    which ``isalpha()`` is true are kept.  One list of words is yielded per
    line.
    """
    for root, dirs, files in os.walk(self.dirname):
        for filename in files:
            file_path = root + '/' + filename
            # The context manager closes each file once it has been read.
            with open(file_path) as handle:
                for line in handle:
                    sline = line.strip()
                    if sline == "":
                        continue
                    rline = clean_html(sline)
                    tokenized_line = ' '.join(tokenize(rline))
                    yield [word
                           for word in jieba.cut(tokenized_line, cut_all=False)
                           if word.isalpha()]
def form_sentences(self, text_block, remove_stopwords=False, stem=True):
    """
    Parse a block of text and form a list of word-tokenized sentences.

    :param text_block: single block of text as string
    :param remove_stopwords: passed on to the word filter
    :param stem: apply ``self.stemmer`` to every kept word
    :return: list of word lists, one per sentence
    """
    def _normalise(word):
        return self.stemmer(word) if stem else word

    # Pass 1: sentence split on the lower-cased text, then strip or replace
    # characters that are not wanted in the sentences.
    substitutions = (('\'', ''), ('(', ' '), (')', ' '), ("/", " or "), ("-", ""))
    stripped = []
    for raw in pattern.tokenize(text_block.lower()):
        for old, new in substitutions:
            raw = raw.replace(old, new)
        stripped.append(raw)

    # Pass 2: drop TAG_RE matches and apply the configured sentence function.
    prepared = []
    for sentence in stripped:
        prepared.append(self.sentence_func(TAG_RE.sub('', sentence)))

    # Pass 3: word tokenize, filter and optionally stem.
    word_lists = []
    for sentence in prepared:
        kept = [w for w in word_tokenize(sentence)
                if self.__word_filter(w, remove_stopwords)]
        word_lists.append([_normalise(w) for w in kept])
    return word_lists
def _transform_file(file_path, w2id, split_par=False, debug=False):
    """
    Clean a file of articles and write it back as one string of known words.

    The file is decoded as latin-1 and split into documents on ``</doc>``.
    Each document is stripped of HTML and title, split into paragraphs on
    blank lines and sentence-tokenized.  Of every sentence only alphabetic
    or numeric tokens are kept (lower-cased) and mapped through
    ``known(word, w2id)``.  Every non-empty sentence is followed by
    ``" <eos> "`` and, when *split_par* is set, every paragraph with at
    least one sentence by ``" <eop> "``.

    The result is written, latin-1 encoded, to ``<file_path>_clean_simple``
    or, when *split_par* is set, ``<file_path>_clean_paragraph``.

    :param file_path: file to transform
    :param w2id: vocabulary passed to ``known``
    :param split_par: emit paragraph markers and use the paragraph file name
    :param debug: print progress messages
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        data = f.read().decode("latin-1")
    docs = data.split("</doc>")
    del data

    if not split_par:
        file_out = "%s_clean_simple" % file_path
    else:
        file_out = "%s_clean_paragraph" % file_path

    # Pieces are collected in a list and joined once at the end; repeated
    # ``+=`` on a growing string can be quadratic.
    pieces = []
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [
            tokenize(par)
            for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
            if par
        ]
        for p in paragraphs:
            par_a = False  # did this paragraph contribute a sentence?
            for sent in p:
                words = [
                    word for word in sent.lower().split()
                    if word.isalpha() or is_number(word)
                ]
                line = " ".join([known(word, w2id) for word in words])
                if line:
                    pieces.append(line + " <eos> ")
                    par_a = True
            if par_a and split_par:
                pieces.append(" <eop> ")

    VectorManager.write_string(file_out, "".join(pieces).encode("latin-1"))
    del pieces
    if debug:
        print("Done with %s" % file_path)
def _transform_file(file_path, debug=False):
    """
    Clean a file of articles and write it back as plain text, one sentence
    per line.

    The file is decoded as latin-1, stripped of HTML and split into
    documents on ``</doc>``.  Each document is split into paragraphs on
    blank lines and sentence-tokenized.  Of every sentence only alphabetic
    or numeric tokens are kept (lower-cased).  Every non-empty sentence is
    written on its own line; an empty line follows each paragraph that
    contributed a sentence and another one each such document.

    The result is written, latin-1 encoded, to ``<file_path>_wl`` (words
    list).

    :param file_path: file to transform
    :param debug: print progress messages
    """
    if debug:
        print("Cleaning %s" % file_path)
    with open(file_path) as f:
        raw = f.read().decode("latin-1")
    data = cleanhtml(raw)
    docs = data.split("</doc>")
    del data
    file_out = "%s_wl" % file_path

    # Pieces are collected in a list and joined once at the end; repeated
    # ``+=`` on a growing string can be quadratic.
    pieces = []
    for doc in [d.strip() for d in docs if d.strip()]:
        paragraphs = [
            tokenize(par)
            for par in remove_title(cleanhtml(doc)).strip().split("\n\n")
            if par
        ]
        doc_a = False  # did this document contribute a sentence?
        for p in paragraphs:
            par_a = False  # did this paragraph contribute a sentence?
            for sent in p:
                line = " ".join([
                    word for word in sent.lower().split()
                    if word.isalpha() or is_number(word)
                ])
                if line:
                    pieces.append(line + "\n")
                    par_a = True
                    doc_a = True
            if par_a:
                pieces.append("\n")
        if doc_a:
            pieces.append("\n")

    VectorManager.write_string(file_out, "".join(pieces).encode("latin-1"))
    del pieces
    if debug:
        print("Done with %s" % file_path)
def get_raw_text(self):
    """
    Load the Gutenberg text and store its sentences in ``self.raw_text``.

    When ``self.command`` is true the text is first downloaded from
    ``self.url`` and cached in ``buddhist_psalm_text.pickle``.  The cache
    file is then read in either case, so it must already exist when
    ``self.command`` is false.  The text is split into a list of sentence
    strings with ``tokenize``.
    """
    cache_path = 'buddhist_psalm_text.pickle'

    if self.command:
        # Only hit the network when explicitly told to.
        buddhist_psalm_text = URL(self.url).download()
        # Save data to a file (will be part of your data fetching script)
        with open(cache_path, 'wb') as f:
            pickle.dump(buddhist_psalm_text, f)

    # Load data from a file (will be part of your data processing script).
    # NOTE: pickle must only be used on trusted files; this cache is
    # written by this method itself.
    with open(cache_path, 'rb') as input_file:
        cached_text = pickle.load(input_file)

    # Use pattern to break the text into a list of strings where each
    # string is a sentence
    self.raw_text = tokenize(cached_text)
def form_sentences(self, text_block, block_id, remove_stopwords=False, stem=True, form_tagged_doc=True):
    """
    Parse a block of text and form a list of word-tokenized sentences.

    :param text_block: single block of text as string
    :param block_id: id of the text block, used to build the document tags
    :param remove_stopwords: passed on to the word filter
    :param stem: apply ``self.stemmer`` to every kept word
    :param form_tagged_doc: wrap every sentence in a TaggedDocument
        (tag ``"<block_id> <sentence index>"``) and register it in
        ``self.doc_tags``
    :return: list of word lists, or list of TaggedDocument objects when
        ``form_tagged_doc`` is true
    """
    def _normalise(word):
        return self.stemmer(word) if stem else word

    # Pass 1: sentence split on the lower-cased text, then strip or replace
    # characters that are not wanted in the sentences.
    substitutions = (('\'', ''), ('(', ' '), (')', ' '), ("/", " or "), ("-", ""))
    stripped = []
    for raw in pattern.tokenize(text_block.lower()):
        for old, new in substitutions:
            raw = raw.replace(old, new)
        stripped.append(raw)

    # Pass 2: drop TAG_RE matches and apply the configured sentence function.
    prepared = []
    for sentence in stripped:
        prepared.append(self.sentence_func(TAG_RE.sub('', sentence)))

    # Pass 3: word tokenize, filter and optionally stem.
    word_lists = []
    for sentence in prepared:
        kept = [w for w in word_tokenize(sentence)
                if self.__word_filter(w, remove_stopwords)]
        word_lists.append([_normalise(w) for w in kept])

    if not form_tagged_doc:
        return word_lists

    tagged = []
    for index, words in enumerate(word_lists):
        tag_name = str(block_id) + ' ' + str(index)
        tagged.append(TaggedDocument(words=words, tags=[tag_name]))
    for document in tagged:
        self.doc_tags[document.tags[0]] = document
    return tagged
def text_sentiment(text):
    """Return aggregated sentiment statistics for *text*.

    The text is converted with ``plaintext``, split into sentences and each
    sentence is scored with ``sentiment``.  The first element of every score
    is treated as polarity and the second as subjectivity.  The result is a
    ``Sentiment`` built from mean polarity, polarity standard deviation,
    mean subjectivity, subjectivity standard deviation and the number of
    sentences.  NaN statistics are replaced with ``0.0``.

    Falsy input returns ``default_sentiment``.
    """
    if not text:
        return default_sentiment

    sentences = tokenize(plaintext(text))
    scores = [sentiment(sentence) for sentence in sentences]
    polarities = [score[0] for score in scores]
    subjectivities = [score[1] for score in scores]

    stats = [
        np.mean(polarities),
        np.std(polarities),
        np.mean(subjectivities),
        np.std(subjectivities),
    ]
    stats = [0.0 if math.isnan(value) else value for value in stats]
    return Sentiment(stats[0], stats[1], stats[2], stats[3], len(sentences))
def form_sentences(self, text_block, remove_stopwords=False, stem=True):
    """
    Parse a block of text and form a list of word-tokenized sentences.

    :param text_block: single block of text as string
    :param remove_stopwords: passed on to the word filter
    :param stem: apply ``self.stemmer`` to every kept word
    :return: list of word lists, one per sentence
    """
    def _normalise(word):
        return self.stemmer(word) if stem else word

    # Pass 1: sentence split on the lower-cased text, then strip or replace
    # characters that are not wanted in the sentences.
    substitutions = (('\'', ''), ('(', ' '), (')', ' '), ("/", " or "), ("-", ""))
    stripped = []
    for raw in pattern.tokenize(text_block.lower()):
        for old, new in substitutions:
            raw = raw.replace(old, new)
        stripped.append(raw)

    # Pass 2: drop TAG_RE matches and apply the configured sentence function.
    prepared = []
    for sentence in stripped:
        prepared.append(self.sentence_func(TAG_RE.sub('', sentence)))

    # Pass 3: word tokenize, filter and optionally stem.
    word_lists = []
    for sentence in prepared:
        kept = [w for w in word_tokenize(sentence)
                if self.__word_filter(w, remove_stopwords)]
        word_lists.append([_normalise(w) for w in kept])
    return word_lists
def summarize(text, sentence_count=2):
    """Return the *sentence_count* highest-ranked sentences, in original order.

    Each sentence becomes a document.  Two documents are connected when the
    Jaccard similarity (``1 - jaccard``) of their keyword lists is positive;
    the similarity is the edge weight.  The sentences are ranked with
    ``pagerank`` over that graph and the best ones are joined with single
    spaces.
    """
    sentence_list = tokenize(text)

    # each document's name is the sentence's original index
    # so that we can put them back together later
    docs = []
    for index, sentence in enumerate(sentence_list):
        docs.append(Document(string=sentence, name=index, stemmer=LEMMA))

    graph = Graph()
    for doc_a, doc_b in combinations(docs, 2):
        words_a = [keyword[1] for keyword in doc_a.keywords()]
        words_b = [keyword[1] for keyword in doc_b.keywords()]
        similarity = 1 - jaccard(words_a, words_b)
        if similarity > 0:
            graph.add_edge(doc_a.name, doc_b.name, weight=similarity)

    ranked = sorted(pagerank(graph).items(), key=itemgetter(1), reverse=True)
    chosen = sorted(index for index, _ in ranked[:sentence_count])
    return ' '.join(sentence_list[index] for index in chosen)
def test_tokenize(): from pattern.en import tokenize sent = "Randstad Technologies - Baltimore , MD - June 2014 to Present Responsibilities Johns Hopkins University , Krieger School of Arts & Sciences June 2014 - present Input Content for websites using the WordPress interface Modified and configured WordPress plug-ins and themes to match design Created Email template for Dean 's Newsletter Launched website and created redirects using .htaccess and Apache conf file" lines = tokenize(sent) print lines
def form_relations(self, text, block_id, payload, ff, persist=True):
    """
    form relation(s) on a given text

    The text is sentence tokenized, every sentence is reduced to ASCII and
    annotated with ``self.relation_annotator``.  For every semantic role
    element two kinds of relations are built: one per pair of arguments and
    one per modifier, linking it to the argument that starts closest to it
    in the sentence.

    :param text: text on which to get the relations on, text will be sentence tokenized and relations formed at
    sentence level
    :param block_id: unique identifier of the block
    :param payload: stored unchanged on every generated RelationTuple
    :param ff: stored unchanged on every generated RelationTuple
    :param persist: persist the relations extracted from the text in the sink,
    relation_sink needed to be specified
    NOTE(review): ``persist`` is not read anywhere in this method -- confirm
    whether persisting happens elsewhere.
    :return: list of relations
    """
    text_sentences = pattern.tokenize(text)
    relations = []
    for sentence in text_sentences:
        # work with ascii string only
        sentence = "".join((c for c in sentence if 0 < ord(c) < 127))
        try:
            senna_annotation = self.relation_annotator.getAnnotations(sentence)
        except Exception as e:
            # a sentence that cannot be annotated is logged and skipped
            logger.error(e)
            continue
        chunk_parse, pos_tags, role_labeling, tokenized_sentence = \
            senna_annotation['chunk'], senna_annotation['pos'], senna_annotation['srl'], \
            senna_annotation['words']
        # nothing to do here empty srl
        if not role_labeling:
            continue
        for semantic_element in role_labeling:
            arguments = RelationExtractor.__populate_arguments(semantic_element)
            modifiers = RelationExtractor.__populate_modifier(semantic_element)
            verb = semantic_element.get('V')
            # order of the arguments returned is important, A0 --> A1 --> A2 --> A3
            # NOTE(review): the order comes from vars(...).itervalues(), i.e.
            # from a dict iteration -- confirm that it yields A0..A3 in order.
            arguments = [v for v in vars(arguments).itervalues() if v]
            modifiers = [v for v in vars(modifiers).itervalues() if v]
            if not arguments:
                continue
            # every pair (ai, aj) of arguments with ai listed before aj
            argument_pairs = [e for e in ((ai, aj) for i, ai in enumerate(arguments) for j, aj in enumerate(arguments)
                                          if i < j)]
            verb = relation_util.normalize_relation(verb)
            # relations between pairs of arguments
            for a0, a1 in argument_pairs:
                en0 = relation_util.form_entity(tokenized_sentence, a0, chunk_parse, pos_tags)
                en1 = relation_util.form_entity(tokenized_sentence, a1, chunk_parse, pos_tags)
                if not en0 or not en1:
                    continue
                relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                               sentence=sentence, text=text, block_id=block_id,
                                               payload=payload, ff = ff))
                logger.info("generated a relation for ")
                logger.info(block_id)
            # relations between each modifier and its closest argument
            for arg_modifier in modifiers:
                mod_pos = sentence.find(arg_modifier)
                # argument whose first occurrence in the sentence is nearest
                # to the first occurrence of the modifier
                linked_arg = min([(a, abs(mod_pos - sentence.find(a))) for a in arguments], key=lambda e: e[1])[0]
                en0 = relation_util.form_entity(tokenized_sentence, linked_arg, chunk_parse, pos_tags)
                en1 = relation_util.form_entity(tokenized_sentence, arg_modifier, chunk_parse, pos_tags)
                if not en0 or not en1:
                    continue
                relations.append(RelationTuple(left_entity=en0, right_entity=en1, relation=verb,
                                               sentence=sentence, text=text, block_id=block_id,
                                               payload=payload, ff=ff))
                logger.info("generated a relation for ")
                logger.info(block_id)
    return relations
# Collect the negative words: every non-empty entry of ``neg`` that does not
# start with ';' is stripped and appended to ``neglist``.
for line in neg:
    for v in line.split("\n"):
        if v:
            if v[0] != ';':
                neglist.append(v.strip())
print poslist
print neglist
print lines
# Drop empty entries from both word lists.
poslist = filter(None, poslist)
neglist = filter(None, neglist)
# Classify every line: it is recorded as positive or negative when one of its
# tokens is found in the corresponding word list.
for line in lines:
    sentences = tokenize(line)
    for s in sentences:
        # NOTE(review): the result of tokenize() is tokenized again and the
        # loop below iterates over what that second call returns, without
        # splitting on whitespace -- confirm that ``word`` really is a single
        # word here.
        tokens= tokenize(s)
        for word in tokens:
            if word in poslist:
                posop.append(line)
            elif word in neglist:
                negop.append(line)
# Remove duplicate lines (this does not preserve the original order).
posop=list(set(posop))
negop=list(set(negop))
print "positive"
for p in posop:
    print p
print "negative"
# Collect the negative words: every non-empty entry of ``neg`` that does not
# start with ';' is stripped and appended to ``neglist``.
for line in neg:
    for v in line.split("\n"):
        if v:
            if v[0] != ';':
                neglist.append(v.strip())
print poslist
print neglist
print lines
# Drop empty entries from both word lists.
poslist = filter(None, poslist)
neglist = filter(None, neglist)
# Classify every line: it is recorded as positive or negative when one of its
# tokens is found in the corresponding word list.
for line in lines:
    sentences = tokenize(line)
    for s in sentences:
        # NOTE(review): the result of tokenize() is tokenized again and the
        # loop below iterates over what that second call returns, without
        # splitting on whitespace -- confirm that ``word`` really is a single
        # word here.
        tokens = tokenize(s)
        for word in tokens:
            if word in poslist:
                posop.append(line)
            elif word in neglist:
                negop.append(line)
# Remove duplicate lines (this does not preserve the original order).
posop = list(set(posop))
negop = list(set(negop))
print "positive"
for p in posop:
    print p
print "negative"
def splitSentences(text):
    """Split ``text`` into sentences with the pattern package's ``tokenize``.

    :param text: the text to split
    :return: whatever ``tokenize(text)`` returns
    """
    # Previous NLTK-based variant: nltk.tokenize.sent_tokenize(text)
    sentences = tokenize(text)
    return sentences
verb=' '.join(bits_to_words(basic_sentence['VP'])) if verb=='is': return "What is "+sbj.lower()+"? "+obj return "What does "+sbj.lower()+" "+lemma(verb.lower())+"?"+" "+obj text=""" A star is a massive ball of plasma (very hot gas) held together by gravity. It radiates energy because of the nuclear reactions inside it It radiates heat and light, and every other part of the electromagnetic spectrum, such as radio waves, micro-waves, X-rays, gamma-rays and ultra-violet radiation. The proportions vary according to the mass and age of the star. The energy of stars comes from nuclear fusion. This is a process that turns a light chemical element into another heavier element. Stars are mostly made of hydrogen and helium. They turn the hydrogen into helium by fusion. When a star is near the end of its life, it begins to change the helium into other heavier chemical elements, like carbon and oxygen. Fusion produces a lot of energy. The energy makes the star very hot. The energy produced by stars radiates away from them. The energy leaves as electromagnetic radiation. """ sentences=tokenize(text) basic_sentences=[] for sentence in sentences: print sentence basic_sentences=basic_sentences+gather_question_bits(sentence) basic_sentences=convert_pp(basic_sentences) for sentence in basic_sentences: print basic_sentence_to_question(sentence)
#refer to http://textminingonline.com/getting-started-with-pattern from pattern.en import tokenize f = """this’s pattern word tokenize""" print "tokens:", tokenize(f) sent_tokenize_test = """Tokenization is the process of breaking a stream of text up into words, phrases, symbols, or other meaningful elements called tokens. The list of tokens becomes input for further processing such as parsing or text mining. Tokenization is useful both in linguistics (where it is a form of text segmentation), and in computer science, where it forms part of lexical analysis.""" print "sentence:",tokenize(sent_tokenize_test) from pattern.en import tag g = """In corpus linguistics, part-of-speech tagging (POS tagging or POST), also called grammatical tagging or word-category disambiguation, is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on both its definition, as well as its context—i.e. relationship with adjacent and related words in a phrase, sentence, or paragraph. A simplified form of this is commonly taught to school-age children, in the identification of words as nouns, verbs, adjectives, adverbs, etc.""" tagged_result = tag(g) print tagged_result from pattern.en import referenced referenced('book') from pattern.en import singularize singularize('wolves') from pattern.en import comparative comparative('bad') #‘worse’ from pattern.en import superlative
text = str(text).replace('\xf7','').replace('\xc3\xba','').replace('\xb6','').replace('\xa9','').replace('\xe2\x99\xaa','') text = str(text).replace('\xc3\xaf','').replace('\x5c','').replace('\xf1','').replace('\xe1','').replace('\xe7','').replace('\xfa','') text = str(text).replace('\xf3','').replace('\xed','').replace('\xe9','').replace('\xe0','').replace('\xae','').replace('\xc2','') text = str(text).replace('\xc3','').replace('\xa2','').replace('\xbf','') # print text except IndexError: print line continue # G. Remove clearly wrong unicode characters -- BOM, NULL (only utf8 hex works) line = str(line).replace('\x00 ','').replace('\xef\xbf\xbd','') print line, # H. Ensure the text is split into sentences # tokenize(string, punctuation=".,;:!?()[]{}`''\"@#$^&*+-|=~_", replace={}) for sentence in tokenize(text): all = "" # I. Select the parser if sentence.isupper() or sentence.islower(): st = UPP else: st = Mix # J. Parts of speech with stanford-ner via pyner reply = st.get_entities(sentence) # {u'PERSON': [u'Bill Clinton'], u'LOCATION': [u'U.S.'], u'O': [u'was President of the']} try: for tup in reply.items(): names = "" if tup[0] == "O" or not tup[0] : continue for name in tup[1]: names = "".join([names,"/",name])