Example #1
 def save(self, manually_splitting=False, source_sentences=()):       
     if not manually_splitting:
         # Tokenize the HTML that is fetched from a wiki article
         sentences = list()
         segment_id = 0
         soup = BeautifulSoup(self.source_text)
         sentence_splitter = determine_splitter(self.language)
         # initial save for foreign key based saves to work
         # save should occur after sent_detector is loaded
         super(SourceArticle, self).save()
         # find all paragraphs
         for p in soup.findAll('p'):
             only_p = p.findAll(text=True)
             p_text = ''.join(only_p)
             # split all sentences in the paragraph
             
             sentences = sentence_splitter(p_text.strip())
             # TODO: remove bad sentences that were missed above
             sentences = [s for s in sentences if not re.match(r"^\**\[\d+\]\**$", s)]
                 
             for sentence in sentences:
                 # replace non-breaking spaces (\xa0) with regular spaces
                 sentence = sentence.replace(u"\xa0", u" ")
                 
                 s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
                 segment_id += 1
                 s.save()
             if sentences:
                 # mark the paragraph's final sentence; guards empty paragraphs
                 s.end_of_paragraph = True
                 s.save()
         self.sentences_processed = True
     else:
         for sentence in source_sentences:
             sentence.save()
     super(SourceArticle, self).save()
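
Every example on this page treats the return value of determine_splitter as a callable that maps a paragraph's text to a list of sentences. The helper itself is not shown here; below is a minimal sketch of that contract, assuming NLTK punkt models keyed by language code (the model names and the fallback are assumptions, not the project's actual code):

import nltk.data

def determine_splitter(language):
    """Return a callable mapping raw text to a list of sentences."""
    # assumption: punkt models named after the language
    punkt_names = {'en': 'english', 'de': 'german', 'fr': 'french'}
    try:
        detector = nltk.data.load(
            'tokenizers/punkt/%s.pickle' % punkt_names.get(language, 'english'))
        return detector.tokenize
    except LookupError:
        # no punkt model installed: fall back to a naive period split
        return lambda text: [s.strip() for s in text.split('.') if s.strip()]
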
Example #2
 def save(self, manually_splitting=False):
     if manually_splitting:
         print 'woo'
     else:
         sentences = list()
         segment_id = 0
         soup = BeautifulSoup(self.source_text)
         sentence_splitter = determine_splitter(self.language)
         # initial save for foreign key based saves to work
         # save should occur after sent_detector is loaded
         super(SourceArticle, self).save()
         for p in soup.findAll('p'):
             only_p = p.findAll(text=True)
             p_text = ''.join(only_p)
             sentences = sentence_splitter(p_text.strip())
             for sentence in sentences:
                 s = SourceSentence(article=self,
                                    text=sentence,
                                    segment_id=segment_id)
                 segment_id += 1
                 s.save()
             if sentences:
                 # mark the paragraph's final sentence; guards empty paragraphs
                 s.end_of_paragraph = True
                 s.save()
         self.sentences_processed = True
     print 'James :: %s' % self
     super(SourceArticle, self).save()
Example #3
    def save(self):
        sentences = list()
        segment_id = 0
        #soup = BeautifulSoup(self.source_text)
        sentence_splitter = determine_splitter(self.language)
        # initial save for foreign key based saves to work
        # save should occur after sent_detector is loaded
        super(SourceArticle, self).save()
        #for p in soup.findAll('p'):
        #    only_p = p.findAll(text=True)
        #    p_text = ''.join(only_p)
        #    for sentence in sentence_splitter(p_text.strip()):
        #        s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
        #        segment_id += 1
        #        s.save()
        #    s.end_of_paragraph = True
        import sys
        print >> sys.stderr, 'got here...'
        for sent, tag in zip(*wiki2sentences(self.source_text, sentence_splitter)):
            s = SourceSentence(article=self, text=sent, segment_id=segment_id)
            segment_id += 1
            if tag == 'LastSentence':
                s.end_of_paragraph = True
            s.save()

        self.sentences_processed = True
        super(SourceArticle, self).save()
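
Example #3 relies on wiki2sentences returning two parallel lists, the sentences and one tag per sentence, with 'LastSentence' marking the end of a paragraph. A small consumer of that contract (a hypothetical helper, not part of wpTextExtractor) makes the convention concrete:

def group_into_paragraphs(sentences, tags):
    """Regroup parallel (sentence, tag) lists into paragraphs, closing a
    paragraph whenever the tag reads 'LastSentence'."""
    paragraphs, current = [], []
    for sent, tag in zip(sentences, tags):
        current.append(sent)
        if tag == 'LastSentence':
            paragraphs.append(' '.join(current))
            current = []
    if current:
        # trailing sentences that never saw a closing tag
        paragraphs.append(' '.join(current))
    return paragraphs
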
Example #4
    def save(self, manually_splitting=False, source_sentences=()):
        if not self.sentences_processed and not manually_splitting:
            soup = BeautifulSoup(self.source_text)
            sentence_splitter = determine_splitter(self.language.code)
            super(SourceArticle, self).save()

            segment_id = 0
            for tag in soup.findAll(re.compile('^[ph]')):
                if re.match('p', tag.name):
                    p_text = ''.join([x.string for x in tag.findAll(text=True)
                                      if not re.match(r'[\[\]\d]+$', x.string)])
                    sentences = sentence_splitter(p_text.strip())
                    for sentence in sentences:
                        # replace non-breaking spaces (\xa0) with regular spaces
                        sentence = sentence.replace(u"\xa0", u" ")
                        src_sent = SourceSentence(article=self,
                                                  text=sentence,
                                                  segment_id=segment_id,
                                                  is_heading=False,
                                                  heading_level=0)
                        segment_id += 1
                        src_sent.save()
                    if sentences:
                        # mark the paragraph's final sentence; guards empty paragraphs
                        src_sent.end_of_paragraph = True
                        src_sent.save()

                elif re.match(r'h\d', tag.name):
                    headline = tag.findAll(attrs={'class': 'mw-headline'})
                    if headline:
                        content = headline[0].string
                    else:
                        content = tag.string
                    # stop before the trailing link sections ('Weblinks' on German Wikipedia)
                    if content.lower() == 'weblinks':
                        break
                    src_sent = SourceSentence(article=self,
                                              text=content,
                                              segment_id=segment_id,
                                              is_heading=True,
                                              heading_level=int(tag.name[-1]))
                    src_sent.save()
                    segment_id += 1

            self.sentences_processed = True

        else:
            for sentence in source_sentences:
                sentence.save()

        super(SourceArticle, self).save()
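
Example #4 is the only variant that walks headings and paragraphs in document order, which is what lets it record is_heading and heading_level per sentence. The traversal is easier to see in isolation; this sketch ports it to bs4 (the examples on this page use the older BeautifulSoup 3 constructor):

import re
from bs4 import BeautifulSoup

html = ("<h2><span class='mw-headline'>History</span></h2>"
        "<p>First sentence. Second sentence.</p>")
soup = BeautifulSoup(html, 'html.parser')
for tag in soup.find_all(re.compile('^[ph]')):
    if tag.name == 'p':
        # paragraph: concatenate its text nodes, as Example #4 does
        print('paragraph: %s' % ''.join(tag.find_all(text=True)))
    elif re.match(r'h\d', tag.name):
        headline = tag.find_all(attrs={'class': 'mw-headline'})
        content = headline[0].string if headline else tag.string
        print('heading level %s: %s' % (tag.name[-1], content))
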
Example #5
File: models.py  Project: mhq/wikitrans
 def save(self):
     sentences = list()
     segment_id = 0
     soup = BeautifulSoup(self.source_text)
     sentence_splitter = determine_splitter(self.language)
     # initial save for foreign key based saves to work
     # save should occur after sent_detector is loaded
     super(SourceArticle, self).save()
     for p in soup.findAll('p'):
         only_p = p.findAll(text=True)
         p_text = ''.join(only_p)
         sentences = sentence_splitter(p_text.strip())
         for sentence in sentences:
             s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
             segment_id += 1
             s.save()
         if sentences:
             # mark the paragraph's final sentence; guards empty paragraphs
             s.end_of_paragraph = True
             s.save()
     self.sentences_processed = True
     super(SourceArticle, self).save()
Example #6
    def save(self, manually_splitting=False, source_sentences=()):
        if not self.sentences_processed and not manually_splitting:
            # Tokenize the HTML that is fetched from a wiki article
            sentences = list()
            segment_id = 0
            soup = BeautifulSoup(self.source_text)
            sentence_splitter = determine_splitter(self.language.code)
            # initial save for foreign key based saves to work
            # save should occur after sent_detector is loaded
            super(SourceArticle, self).save()
            # find all paragraphs
            for p in soup.findAll('p'):
                only_p = p.findAll(text=True)
                p_text = ''.join(only_p)
                # split all sentences in the paragraph

                sentences = sentence_splitter(p_text.strip())
                # TODO: remove bad sentences that were missed above
                sentences = [
                    s for s in sentences if not re.match(r"^\**\[\d+\]\**$", s)
                ]

                for sentence in sentences:
                    # replace non-breaking spaces (\xa0) with regular spaces
                    sentence = sentence.replace(u"\xa0", u" ")

                    s = SourceSentence(article=self,
                                       text=sentence,
                                       segment_id=segment_id)
                    segment_id += 1
                    s.save()
                if sentences:
                    # mark the paragraph's final sentence; guards empty paragraphs
                    s.end_of_paragraph = True
                    s.save()
            self.sentences_processed = True
        else:
            for sentence in source_sentences:
                sentence.save()

        super(SourceArticle, self).save()
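
Examples #1 and #6 also drop "sentences" that are nothing but a stray citation marker, e.g. '[3]' or '**[12]**', which survive the splitter. The filter regex is easy to sanity-check on its own:

import re

# matches a string that is only a bracketed citation number,
# optionally wrapped in asterisks
citation_only = re.compile(r"^\**\[\d+\]\**$")

sentences = ['Obama was born in 1961.', '[3]', '**[12]**', 'He later moved.']
kept = [s for s in sentences if not citation_only.match(s)]
assert kept == ['Obama was born in 1961.', 'He later moved.']
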
Example #7
import sys
import datetime

import wikipydia
import wpTextExtractor
# assumption: determine_splitter is importable from the project's utils module
from utils import determine_splitter

def write_lines_to_file(output_filename, lines):
	"""                                                                                                                     \
	Writes a list of lines to file.                                                                                       \
	"""
	output_file = open(output_filename, 'w')
	for line in lines:
		output_file.write(line.encode('UTF-8'))
		output_file.write('\n'.encode('UTF-8'))
	output_file.close()
	return lines

#topics = read_lines_from_file('/Users/bahn/work/wikitopics/data/clustering/pick/pick0127')
date = datetime.date(2009, 10, 12)
lang = 'en'

sentences, tags = wpTextExtractor.wiki2sentences("<!-- See  -->\n<!-- PLEASE DO NOT CHANGE OBAMA'S NAME -->", determine_splitter(lang), True)
for s in sentences:
	print s
sys.exit(0)  # debugging early-exit left in; nothing below this line runs

#topics = ['Inauguration_of_Barack_Obama', 'Bill_Clinton', 'Black_Saturday_bushfires', 'Estradiol','Emma_Frost','Influenza','James','Brett_Favre']
topics = ['Barack_Obama']
shown = {}
shown2 = {}
shown3 = {}
for article in topics:
	revid = wikipydia.query_revid_by_date(article, lang, date)
	print revid
	wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
	sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
	wikimarkup = '\n'.join(sentences)
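
The script above stops short of persisting anything: the final loop fetches a dated revision, splits it, and rejoins the sentences. A hedged continuation that packages those steps and writes one sentence per line via write_lines_to_file (the function name and output filename are illustrative, not from the original script):

def fetch_and_save_sentences(article, lang, date, output_filename):
	"""Fetch the revision of `article` current on `date`, split it into
	sentences, and write them to `output_filename`, one per line."""
	revid = wikipydia.query_revid_by_date(article, lang, date)
	wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
	sentences, tags = wpTextExtractor.wiki2sentences(
		wikimarkup, determine_splitter(lang), True)
	return write_lines_to_file(output_filename, sentences)

fetch_and_save_sentences('Barack_Obama', 'en', date, 'Barack_Obama.sent')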