def tokenize_merge(row): allwords = [] for text in row.iloc[1:].dropna(): text = text.lstrip("b\'").lstrip("b\"").lstrip("b\'''") s = Sentence.from_raw(text,StopWords,neg_mark=True) allwords += s.words print allwords# show progress return allwords
def test_negation_suffix(): stopwords = common.make_stop_words() sentences = [ "I don't like Beijing 123, because it's too expensive", "I cannot 4 run away 56, since I am a grown man", "never ever come back again, I swear to god","without any problem","I don't think I will enjoy it: it might be too spicy" ] for index,raw_sent in enumerate(sentences): sentence = Sentence.from_raw(raw_sent,stopwords) print "\n=========================== [{}]".format(index+1) print sentence.raw print sentence.words
def tokenize_merge(row): allwords = [] for text in row.iloc[1:].dropna(): text = text.lstrip("b\'").lstrip("b\"").lstrip("b\'''") s = Sentence.from_raw(text, StopWords, neg_mark=True) allwords += s.words print allwords # show progress return allwords
def preproc_save_sentences(filename,raw_sent_stream,extra_stopwords = None): stop_words = set(stopwords.words("english")) if extra_stopwords is not None: stop_words |= set(extra_stopwords) with open(filename,"wt") as outf: outf.write("[") for index,raw_sent in enumerate( raw_sent_stream): prev_terminator = '\n' if index ==0 else ',\n' sentence = Sentence.from_raw(raw_sent,stop_words) if len(sentence.words)>0: outf.write(prev_terminator + sentence.dump_json()) print "{}-th sentence processed and saved".format(index+1) outf.write("\n]")
def test_sentence(): stopwords = text_utility.make_stop_words() texts = [ "can't is a contraction", "she isn't my wife any more", "I am not in USA right now", "I'm a Chinese", "1630 NE Valley Rd, Pullman, WA, 99163, Apt X103", "I should've done that thing I didn't do", "I don't love her any more", "I want to divorce without hesitation", "bye, Pullman, bye, USA" ] for index, text in enumerate(texts): sent = Sentence.from_raw(text, stopwords, True) print "\n******************** {}".format(index + 1) print sent.raw print "===>" print sent.words
def test_sentence(): stopwords = text_utility.make_stop_words() texts = [ "can't is a contraction", "she isn't my wife any more", "I am not in USA right now", "I'm a Chinese", "1630 NE Valley Rd, Pullman, WA, 99163, Apt X103", "I should've done that thing I didn't do", "I don't love her any more", "I want to divorce without hesitation", "bye, Pullman, bye, USA"] for index,text in enumerate(texts): sent = Sentence.from_raw(text,stopwords,True) print "\n******************** {}".format(index+1) print sent.raw print "===>" print sent.words
def print_topics(txt): sentence = Sentence.from_raw(txt,stop_words) print "\n{}\n".format(sentence.raw) coded_words = wordcoder.code(sentence.words) bow = dictionary.doc2bow(coded_words) topic_distribution = lda_model[bow] topic_distribution.sort(key=lambda t: t[1], reverse=True) tags = None for index, (topic_id, topic_percentage) in enumerate(topic_distribution): mt = MixTopic(topic_mapping[topic_id]) mt.weight(topic_percentage) if tags is None: tags = mt else: tags.add(mt) tags.normalize() print tags
def update_add_neg_suffix(dbname,query_condition): stop_words = common.make_stop_words() client = MongoClient() review_collection = client[dbname]['reviews'] cursor = review_collection.find(query_condition,{"sentences.raw":1,"sentences.words":1}) for rindex,rd in enumerate(cursor): review = Review.from_dict(rd) update_content = {} for sindex,sent in enumerate(review.sentences): new_sent = Sentence.from_raw(sent.raw,stop_words) if set(new_sent.words) != set(sent.words): update_content["sentences.{}.words".format(sindex)] = new_sent.words if len(update_content)>0: result = review_collection.update_one({"_id":review.id},{"$set":update_content}) if result.modified_count != 1: raise Exception("failed to update review<{}>".format(review.id)) print "{}-th review updated {} sentences".format(rindex+1,len(update_content)) client.close()
def __init__(self, id=None, text=None, is_positive=None):
    """Optionally parse *text* into a Sentence (with negation marking) and
    tag it with the given sentiment label."""
    self.id = id
    self.sent = None
    if text is not None:
        self.sent = Sentence.from_raw(text, Review.StopWords, neg_mark=True)
    if self.sent is not None:
        self.sent.sentiment = is_positive
def __init__(self, id=None, text=None, is_positive=None):
    """Store *id*; when *text* is given, parse it into a negation-marked
    Sentence and record its sentiment."""
    self.id = id
    sent = (Sentence.from_raw(text, Review.StopWords, neg_mark=True)
            if text is not None else None)
    if sent is not None:
        sent.sentiment = is_positive
    self.sent = sent
def assign_comment(self, text, stop_words):
    """Split *text* into sentences, tokenize each, and keep only the
    sentences that produced at least one word."""
    parsed = (Sentence.from_raw(chunk, stop_words)
              for chunk in Review.SentTokenizer.tokenize(text))
    self.sentences = [sent for sent in parsed if len(sent.words) > 0]