def save(self, manually_splitting=False, source_sentences=()):
    if not manually_splitting:
        # Tokenize the HTML that is fetched from a wiki article
        sentences = list()
        segment_id = 0
        soup = BeautifulSoup(self.source_text)
        sentence_splitter = determine_splitter(self.language)
        # initial save for foreign key based saves to work
        # save should occur after sent_detector is loaded
        super(SourceArticle, self).save()
        # find all paragraphs
        for p in soup.findAll('p'):
            only_p = p.findAll(text=True)
            p_text = ''.join(only_p)
            # split all sentences in the paragraph
            sentences = sentence_splitter(p_text.strip())
            # TODO: remove bad sentences that were missed above
            sentences = [s for s in sentences if not re.match(r"^\**\[\d+\]\**$", s)]
            for sentence in sentences:
                # Clean up bad spaces (non-breaking spaces)
                sentence = sentence.replace(u'\xa0', u' ')
                s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
                segment_id += 1
                s.save()
            # mark the last sentence of the paragraph
            s.end_of_paragraph = True
            s.save()
        self.sentences_processed = True
    else:
        for sentence in source_sentences:
            sentence.save()
    super(SourceArticle, self).save()

def save(self, manually_splitting=False):
    if manually_splitting:
        print 'woo'
    else:
        sentences = list()
        segment_id = 0
        soup = BeautifulSoup(self.source_text)
        sentence_splitter = determine_splitter(self.language)
        # initial save for foreign key based saves to work
        # save should occur after sent_detector is loaded
        super(SourceArticle, self).save()
        for p in soup.findAll('p'):
            only_p = p.findAll(text=True)
            p_text = ''.join(only_p)
            for sentence in sentence_splitter(p_text.strip()):
                s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
                segment_id += 1
                s.save()
            # mark the last sentence of the paragraph
            s.end_of_paragraph = True
            s.save()
        self.sentences_processed = True
        print 'James :: %s' % self
        super(SourceArticle, self).save()

def save(self):
    sentences = list()
    segment_id = 0
    #soup = BeautifulSoup(self.source_text)
    sentence_splitter = determine_splitter(self.language)
    # initial save for foreign key based saves to work
    # save should occur after sent_detector is loaded
    super(SourceArticle, self).save()
    #for p in soup.findAll('p'):
    #    only_p = p.findAll(text=True)
    #    p_text = ''.join(only_p)
    #    for sentence in sentence_splitter(p_text.strip()):
    #        s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
    #        segment_id += 1
    #        s.save()
    #    s.end_of_paragraph = True
    import sys
    print >> sys.stderr, 'got here...'
    for sent, tag in zip(*wiki2sentences(self.source_text, sentence_splitter)):
        s = SourceSentence(article=self, text=sent, segment_id=segment_id)
        segment_id += 1
        if tag == 'LastSentence':
            s.end_of_paragraph = True
        s.save()
    self.sentences_processed = True
    super(SourceArticle, self).save()

def save(self, manually_splitting=False, source_sentences=()):
    if not self.sentences_processed and not manually_splitting:
        soup = BeautifulSoup(self.source_text)
        sentence_splitter = determine_splitter(self.language.code)
        # initial save for foreign key based saves to work
        super(SourceArticle, self).save()
        segment_id = 0
        # walk paragraph and heading tags in document order
        for tag in soup.findAll(re.compile('^[ph]')):
            if re.match('p', tag.name):
                # drop text nodes that are only reference markers like [1]
                p_text = ''.join([x.string for x in tag.findAll(text=True)
                                  if not re.match(r'[\[\]\d]+$', x.string)])
                sentences = sentence_splitter(p_text.strip())
                for sentence in sentences:
                    # Clean up bad spaces (non-breaking spaces)
                    sentence = sentence.replace(u'\xa0', u' ')
                    src_sent = SourceSentence(article=self, text=sentence,
                                              segment_id=segment_id,
                                              is_heading=False, heading_level=0)
                    segment_id += 1
                    src_sent.save()
                # mark the last sentence of the paragraph
                src_sent.end_of_paragraph = True
                src_sent.save()
            elif re.match(r'h\d', tag.name):
                headline = tag.findAll(attrs={'class': 'mw-headline'})
                if headline:
                    content = headline[0].string
                else:
                    content = tag.string
                if content.lower() == 'weblinks':
                    break
                src_sent = SourceSentence(article=self, text=content,
                                          segment_id=segment_id,
                                          is_heading=True,
                                          heading_level=int(tag.name[-1]))
                src_sent.save()
                segment_id += 1
        self.sentences_processed = True
    else:
        for sentence in source_sentences:
            sentence.save()
    super(SourceArticle, self).save()

def save(self):
    sentences = list()
    segment_id = 0
    soup = BeautifulSoup(self.source_text)
    sentence_splitter = determine_splitter(self.language)
    # initial save for foreign key based saves to work
    # save should occur after sent_detector is loaded
    super(SourceArticle, self).save()
    for p in soup.findAll('p'):
        only_p = p.findAll(text=True)
        p_text = ''.join(only_p)
        for sentence in sentence_splitter(p_text.strip()):
            s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
            segment_id += 1
            s.save()
        # mark the last sentence of the paragraph
        s.end_of_paragraph = True
        s.save()
    self.sentences_processed = True
    super(SourceArticle, self).save()

def save(self, manually_splitting=False, source_sentences=()):
    if not self.sentences_processed and not manually_splitting:
        # Tokenize the HTML that is fetched from a wiki article
        sentences = list()
        segment_id = 0
        soup = BeautifulSoup(self.source_text)
        sentence_splitter = determine_splitter(self.language.code)
        # initial save for foreign key based saves to work
        # save should occur after sent_detector is loaded
        super(SourceArticle, self).save()
        # find all paragraphs
        for p in soup.findAll('p'):
            only_p = p.findAll(text=True)
            p_text = ''.join(only_p)
            # split all sentences in the paragraph
            sentences = sentence_splitter(p_text.strip())
            # TODO: remove bad sentences that were missed above
            sentences = [s for s in sentences if not re.match(r"^\**\[\d+\]\**$", s)]
            for sentence in sentences:
                # Clean up bad spaces (non-breaking spaces)
                sentence = sentence.replace(u'\xa0', u' ')
                s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
                segment_id += 1
                s.save()
            # mark the last sentence of the paragraph
            s.end_of_paragraph = True
            s.save()
        self.sentences_processed = True
    else:
        for sentence in source_sentences:
            sentence.save()
    super(SourceArticle, self).save()

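# A hypothetical caller-side sketch of the manual-splitting branch above; the
# article lookup, sentence texts, and variable names are illustrative
# assumptions, not taken from the original code.
article = SourceArticle.objects.get(pk=1)
manual_sentences = [
    SourceSentence(article=article, text=text, segment_id=i)
    for i, text in enumerate([u'First sentence.', u'Second sentence.'])
]
article.save(manually_splitting=True, source_sentences=manual_sentences)
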
def write_lines_to_file(output_filename, lines):
    """
    Writes a list of lines to file.
    """
    output_file = open(output_filename, 'w')
    for line in lines:
        output_file.write(line.encode('UTF-8'))
        output_file.write('\n'.encode('UTF-8'))
    output_file.close()
    return lines


#topics = read_lines_from_file('/Users/bahn/work/wikitopics/data/clustering/pick/pick0127')
date = datetime.date(2009, 10, 12)
lang = 'en'
sentences, tags = wpTextExtractor.wiki2sentences(
    "<!-- See -->\n<!-- PLEASE DO NOT CHANGE OBAMA'S NAME -->",
    determine_splitter(lang), True)
for s in sentences:
    print s
sys.exit(0)

#topics = ['Inauguration_of_Barack_Obama', 'Bill_Clinton', 'Black_Saturday_bushfires', 'Estradiol', 'Emma_Frost', 'Influenza', 'James', 'Brett_Favre']
topics = ['Barack_Obama']
shown = {}
shown2 = {}
shown3 = {}
for article in topics:
    revid = wikipydia.query_revid_by_date(article, lang, date)
    print revid
    wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
    sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
    wikimarkup = '\n'.join(sentences)

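# A hypothetical follow-up sketch, not part of the original script: the sentences
# produced by wiki2sentences could be written out with the helper above. The
# output path is an illustrative assumption.
write_lines_to_file('/tmp/%s.sentences' % article, sentences)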