def __call__(self, doc):
    """Rewrite *doc* word-by-word.

    The document is split into sentences, every word is passed through
    ``self.modify_word``, and the result is reassembled with words
    separated by spaces and sentences separated by newlines.

    Args:
        doc: a string document.

    Returns:
        The rewritten document as a single string.
    """
    rebuilt = []
    for sentence in sentence_tokenizer(doc):
        modified = (self.modify_word(token) for token in sentence)
        rebuilt.append(' '.join(modified))
    return '\n'.join(rebuilt)
def __call__(self, doc):
    """Apply ``self.modify_word`` to each word of *doc*.

    Sentences (as produced by ``sentence_tokenizer``) are re-joined
    with single spaces and concatenated one-per-line.

    Args:
        doc: a string document.

    Returns:
        The transformed document as a newline-joined string.
    """
    lines = [
        ' '.join(self.modify_word(word) for word in sentence)
        for sentence in sentence_tokenizer(doc)
    ]
    return '\n'.join(lines)
def __call__(self, doc):
    """Lower-case sentences that contain no lowercase characters.

    Fix: the original used a Python 2 ``print`` statement and the
    ``unicode`` builtin, neither of which exists in Python 3 (sibling
    versions of this method in the file already use ``print()``).

    Args:
        doc: a string document.

    Returns:
        A single space-joined string of the processed sentences.
    """
    doc2 = []
    for sent in sentence_tokenizer(doc):
        if not is_any_lowercase(sent):
            # Only report sentences long enough to be real text, but
            # decap every all-caps sentence regardless of length.
            if len(sent) > 4:
                print("DECAPING: '{}'".format(' '.join(sent)))
            # str.lower instead of the removed `unicode` builtin.
            sent = [w.lower() for w in sent]
        doc2.append(' '.join(sent))
    return ' '.join(doc2)
def __call__(self, doc):
    """Lower-case sentences that contain no lowercase characters.

    Fix: this was a half-finished Python 2 -> 3 port — ``print()`` was
    converted but ``map(unicode.lower, sent)`` still referenced the
    ``unicode`` builtin, which raises ``NameError`` on Python 3.

    Args:
        doc: a string document.

    Returns:
        A single space-joined string of the processed sentences.
    """
    sents = sentence_tokenizer(doc)
    doc2 = []
    for sent in sents:
        if not is_any_lowercase(sent):
            if len(sent) > 4:
                print("DECAPING: '{}'".format(' '.join(sent)))
            # `unicode` no longer exists; use str.lower via a list
            # comprehension (also keeps `sent` a list, not a map object).
            sent = [w.lower() for w in sent]
        doc2.append(' '.join(sent))
    doc2 = ' '.join(doc2)
    return doc2
def __call__(self, text):
    """Run the parser.

    Any sentence with no lowercase characters is lower-cased; such
    sentences longer than ``self.min_length`` tokens are logged first.

    Args:
        text: a string document

    Returns:
        A string document.
    """
    output = []
    for tokens in sentence_tokenizer(text):
        if not is_any_lowercase(tokens):
            if len(tokens) > self.min_length:
                self.logger.info("DECAPING: '{}'".format(' '.join(tokens)))
            tokens = [tok.lower() for tok in tokens]
        output.append(' '.join(tokens))
    return ' '.join(output)
# NOTE(review): this span is a flattened multi-line script fragment. The
# original line breaks — and the enclosing `def` that owns the leading
# `for row in query: ... return data` section — are outside this view, so
# the code is left byte-identical rather than re-indented by guesswork.
# Read in order, it appears to: (1) finish a corpus-normalization loop
# (NFKD-normalize each row, collapse runs of whitespace, collect into
# `data`); (2) sentence-tokenize the normalized `pitchfork` corpus;
# (3) count sentences containing backslash escapes, then drop the ones
# with literal `\t` sequences — presumably leftover site/JS markup; the
# counts (503 / 10) are exploratory observations, not invariants. TODO:
# restore the original line structure before this can run.
for row in query: unicodedata.normalize("NFKD", row[0].strip()) # Get rid of formatting and compact older reviews data.append(re.sub('\s+', ' ', row[0])) return data pitchfork = normalize_corpus(pitchfork) # Create sentence tokens using sentence_tokenizer from local tokenizer # module. Stores sentence tokens in list of list of reviews # Sentences object can be used for processing but we'll write out a text file # Text file can be streamed to interator to feed gensim word2vec model # Reading back in will also make sure we're fully unicode regularized pitchfork_sentences = sentence_tokenizer(pitchfork) # There are 503 sentences with escapes followed by non-word characters # Hand sampling / search shows they are all embedded in the DB and site text regex = re.compile(r'\\[a-z]') errors = [i for i in pitchfork_sentences if regex.search(i)] len(errors) # 10 sentences have \t, javascript errors in old reviews, # all quoting outside text. Those are of no consequence regex = re.compile(r'\\t') errors = [i for i in pitchfork_sentences if regex.search(i)] len(errors) pitchfork_sentences = [i for i in pitchfork_sentences if not regex.search(i)]