def save(self):
    sentences = list()
    segment_id = 0
    #soup = BeautifulSoup(self.source_text)
    sentence_splitter = determine_splitter(self.language)
    # initial save for foreign-key based saves to work;
    # save should occur after sent_detector is loaded
    super(SourceArticle, self).save()
    #for p in soup.findAll('p'):
    #    only_p = p.findAll(text=True)
    #    p_text = ''.join(only_p)
    #    for sentence in sentence_splitter(p_text.strip()):
    #        s = SourceSentence(article=self, text=sentence, segment_id=segment_id)
    #        segment_id += 1
    #        s.save()
    #    s.end_of_paragraph = True
    for sent, tag in zip(*wiki2sentences(self.source_text, sentence_splitter)):
        s = SourceSentence(article=self, text=sent, segment_id=segment_id)
        segment_id += 1
        if tag == 'LastSentence':
            s.end_of_paragraph = True
        s.save()
    self.sentences_processed = True
    super(SourceArticle, self).save()
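# Illustrative sketch, not part of the model: the loop above relies on
# wiki2sentences returning two parallel sequences (sentences and per-sentence
# tags) so that zip(*...) pairs each sentence with its tag. Only the
# 'LastSentence' tag is confirmed by this code; the sample text and the exact
# other tag values below are assumptions for illustration.
#
#   sentences, tags = wiki2sentences(u"One. Two.\n\nThree.", determine_splitter('en'))
#   # sentences -> [u'One.', u'Two.', u'Three.']
#   # tags      -> [..., 'LastSentence', 'LastSentence']   # 'LastSentence' closes a paragraph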
def main():
    optParse(
        trace__T=None,
        language__L='|'.join(l for p in languages for l in p),
        fromDump__D='',
        showType__S=None,
        withTags__W=None
    )
    sent_detector = nltk.data.load('tokenizers/punkt/%s.pickle' % lang2long(options.language)).tokenize
    if options.fromDump:
        if options.fromDump.endswith('.gz'):
            source = os.popen('zcat %s' % options.fromDump)
        else:
            source = open(options.fromDump)
        currentLines = []
        for line in source:
            line = line.strip()
            if line.startswith('<title>'):
                print line
            elif line.startswith('<text'):
                currentLines.append(line.split('>', 1)[1])
            elif currentLines:
                if line.endswith('</text>'):
                    currentLines.append(line.rsplit('<', 1)[0])
                    print '\n'.join(wiki2sentences('\n'.join(currentLines), sent_detector, False))
                    currentLines = []
                else:
                    currentLines.append(line)
    else:
        for title in arguments:
            if title == 'Barack Obama' and options.language == 'en':
                text = open('obama.src').read().decode('utf-8')
            else:
                text = wikipydia.query_text_raw(title, language=lang2short(options.language))['text']
            if options.withTags:
                for s, t in zip(*wiki2sentences(text, sent_detector, True)):
                    print t[:4], s.encode('utf-8')
            else:
                print '\n'.join(wiki2sentences(text, sent_detector, False)).encode('utf-8')
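# Minimal programmatic sketch mirroring the calls in main() above. Assumes the
# NLTK punkt English model is installed and that wiki2sentences and wikipydia
# are importable from this module; the article title is just an example.
def _example_wiki2sentences():
    import nltk
    import wikipydia
    sent_detector = nltk.data.load('tokenizers/punkt/english.pickle').tokenize
    text = wikipydia.query_text_raw('Barack Obama', language='en')['text']
    # withTags=True returns parallel lists of sentences and tags
    for s, t in zip(*wiki2sentences(text, sent_detector, True)):
        print t[:4], s.encode('utf-8')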
def get_sentences_for_article(article, article_id, lang, sentence_filename, write_to_file=True):
    """
    Converts the article to text and splits it into sentences.
    Appends the sentences to file.
    """
    wikimarkup = wikipydia.query_text_raw(article, lang)['text']
    sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
    if write_to_file:
        sentences = write_lines_to_file(sentence_filename, sentences)
        tags = write_lines_to_file(sentence_filename + '.tags', tags)
        seg_ids = []
        for i in range(0, len(sentences)):
            id = article_id + '_' + str(i)
            seg_ids.append(id)
        seg_ids = write_lines_to_file(sentence_filename + '.seg_ids', seg_ids)
    return sentences
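# Usage sketch for get_sentences_for_article. The title, segment-id prefix and
# output filename are placeholder values; as in the function above, the call
# also writes <filename>.tags and <filename>.seg_ids next to the sentence file.
def _example_get_sentences_for_article():
    sentences = get_sentences_for_article('Barack Obama', 'en_0', 'en',
                                          'Barack_Obama.sentences')
    print '%d sentences written' % len(sentences)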
def write_lines_to_file(output_filename, lines):
    """
    Writes a list of lines to file.
    """
    output_file = open(output_filename, 'w')
    for line in lines:
        output_file.write(line.encode('UTF-8'))
        output_file.write('\n'.encode('UTF-8'))
    output_file.close()
    return lines


#topics = read_lines_from_file('/Users/bahn/work/wikitopics/data/clustering/pick/pick0127')
date = datetime.date(2009, 10, 12)
lang = 'en'
sentences, tags = wpTextExtractor.wiki2sentences("<!-- See -->\n<!-- PLEASE DO NOT CHANGE OBAMA'S NAME -->", determine_splitter(lang), True)
for s in sentences:
    print s
sys.exit(0)

#topics = ['Inauguration_of_Barack_Obama', 'Bill_Clinton', 'Black_Saturday_bushfires', 'Estradiol', 'Emma_Frost', 'Influenza', 'James', 'Brett_Favre']
topics = ['Barack_Obama']
shown = {}
shown2 = {}
shown3 = {}
for article in topics:
    revid = wikipydia.query_revid_by_date(article, lang, date)
    print revid
    wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
    sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True)
    wikimarkup = '\n'.join(sentences)
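# Usage sketch for write_lines_to_file above: each (unicode) line is written
# UTF-8 encoded, one per line, and the list is returned unchanged. The filename
# is a placeholder.
#
#   write_lines_to_file('example.sentences', [u'First sentence.', u'Second sentence.'])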
import datetime
import sys

import wpTextExtractor

date = datetime.date(int(sys.argv[2][:4]), int(sys.argv[2][5:7]), int(sys.argv[2][8:10]))
negatives = get_negative_controls(sys.argv[1], date, int(sys.argv[3]))
for article in negatives:
    print article.replace("_", " "), '\t',
    print '-1', '\t',
    first_sentence = ''
    paragraph = ''
    text = query_text_raw(article, sys.argv[1])['text']
    sentences, tags = wpTextExtractor.wiki2sentences(text, determine_splitter(sys.argv[1]), True)
    for sent, tag in zip(sentences, tags):
        if first_sentence == '':
            first_sentence = '1'
            print sent.encode('utf-8').rstrip(), '\t',
        print sent.encode('utf-8'),
        if tag == "LastSentence":
            break
    print ""

#negatives = get_negative_controls(sys.argv[1], date, int(sys.argv[3]))
#for article in negatives:
def get_lang_links_context(lang_links, lang, max_items=settings["top_links"], num_context_sentences=settings["num_context_sentences"]):
    """
    Extracts all of the non-English vocabulary from each of the pages, and retains
    up to the specified number of context sentences. The vocab is normalized by
    lowercasing and stripping punctuation.
    """
    logging.info("getting context for interlanguage links")
    # add all unicode punctuation categories for exclusion
    all_chars = (unichr(i) for i in xrange(0x10000))
    punct = ''.join(c for c in all_chars if unicodedata.category(c)[0] == 'P')
    #punct_to_exclude = set(string.punctuation + "1234567890")
    punct_to_exclude = set(punct + "1234567890")
    links_with_context = {}
    splitter = determine_splitter(lang)
    for i, en_article in enumerate(lang_links):
        logging.debug("item # %s from %s, # of good links %s, # of links needed %s" % (i, en_article, len(links_with_context), max_items))
        if len(links_with_context) >= max_items:
            break
        article = lang_links[en_article]["translation"]
        if use_as_gold_standard_translation(en_article, article, lang):
            logging.debug("link accepted %s - %s" % (en_article, article))
            word = unicode(article, "UTF-8")
            try:
                wikimarkup = wikipydia.query_text_raw(article, lang)['text']
                sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, splitter, True)
                for j, sentence in enumerate(sentences):
                    if re.search(word, sentence):
                        if not word in links_with_context:
                            links_with_context[word] = {"context": [], "translation": en_article}
                        if len(links_with_context[word]["context"]) < num_context_sentences:
                            links_with_context[word]["context"].append(sentence)
                            links_with_context[word]["translation"] = en_article
                        else:
                            break
            except KeyError:
                #logging.debug(u'no page for %s %s' % (article, lang))
                print u'no page for ', article, lang
            except IOError:
                logging.debug(u'cannot reach %s %s' % (article, lang))
            except TypeError:
                #logging.debug(u'unicode object error for %s %s' % (article, lang))
                print u'unicode object error for ', article, lang
            except UnicodeDecodeError:
                #logging.debug(u'unicode error for %s %s' % (article, lang))
                print u'unicode error ', article, lang
            except:
                #logging.debug(u'something weird happened for %s %s' % (article, lang))
                print u'something weird happened for ', article, lang
        else:
            logging.debug("link rejected %s - %s" % (en_article, article))
    return links_with_context
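# Data-shape sketch for get_lang_links_context. The single made-up entry below
# illustrates the expected layout: the input is keyed by English title with a
# 'translation' field holding the foreign-language title, and the result is
# keyed by the foreign word with its English translation and context sentences,
# matching the dicts built inside the function above.
def _example_get_lang_links_context():
    lang_links = {'Butterfly': {'translation': 'Schmetterling'}}
    links = get_lang_links_context(lang_links, 'de', max_items=1, num_context_sentences=2)
    # expected shape: {u'Schmetterling': {'translation': 'Butterfly', 'context': [...]}}
    print links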
def get_vocab(articles, lang, lang_properties, num_context_sentences=settings["num_context_sentences"], max_articles=settings["top_articles"]):
    # build vocabulary based on a list of articles and compile context sentences for each word
    logging.info("generating vocabulary")
    # add all unicode punctuation categories for exclusion
    all_chars = (unichr(i) for i in xrange(0x10000))
    punct = ''.join(c for c in all_chars if unicodedata.category(c)[0] == 'P')
    #punct_to_exclude = set(string.punctuation + "1234567890")
    punct_to_exclude = set(punct + "1234567890")
    vocab = {}
    num_articles = 0
    splitter = determine_splitter(lang)
    for i, article in enumerate(articles):
        try:
            wikimarkup = wikipydia.query_text_raw(articles[i], lang)['text']
            sentences, tags = wpTextExtractor.wiki2sentences(wikimarkup, splitter, True)
            for sentence in sentences:
                sent = ''.join(ch for ch in sentence if ch not in punct_to_exclude)
                sent = sent.lower()
                words = sent.split(' ')
                for word in words:
                    # filter words that are obviously not foreign-language text (plain English or gibberish/non-alpha)
                    #if not word in en_vocab:
                    if len(word) < settings["min_letters"]:
                        break
                    if not word in vocab:
                        vocab[word] = {"frequency": 1, "context": []}
                    else:
                        vocab[word]["frequency"] = vocab[word]["frequency"] + 1
                    if len(vocab[word]["context"]) < num_context_sentences:
                        vocab[word]["context"].append(sentence)
            num_articles = num_articles + 1
            if num_articles >= max_articles:
                break
        except KeyError:
            #logging.debug(u'no page for %s %s' % (article, lang))
            print u'no page for ', article, lang
        except IOError:
            #logging.debug(u'cannot reach %s %s' % (article, lang))
            print u'cannot reach ', article, lang
        except TypeError:
            #logging.debug(u'unicode object error for %s %s' % (article, lang))
            print u'unicode object error for ', article, lang
        except UnicodeDecodeError:
            #logging.debug(u'unicode error for %s %s' % (article, lang))
            print u'unicode error for ', article, lang
        except:
            #logging.debug(u'something weird happened for %s %s' % (article, lang))
            print u'something weird happened for ', article, lang
    logging.info("vocabulary size: %s" % (len(vocab)))
    return vocab
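# Data-shape sketch for get_vocab. The article list and lang_properties value
# are placeholders; each vocab entry maps a lowercased word to its frequency
# and up to num_context_sentences example sentences, as built above.
def _example_get_vocab():
    vocab = get_vocab(['Schmetterling'], 'de', lang_properties={}, max_articles=1)
    for word, entry in sorted(vocab.items())[:5]:
        print word.encode('utf-8'), entry['frequency'], len(entry['context'])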
def fetch_articles_on_date(topics, date, lang, output_dir, upperlimit, dryrun, retry=5, wait=5):
    if os.path.exists(output_dir):
        if not os.path.isdir(output_dir):
            sys.stderr.write(output_dir + " is not a directory\n")
            sys.exit(1)
    else:
        os.makedirs(output_dir)
    mark = {}
    success = 0
    articles = {}
    for article, values in topics.items():
        if success >= upperlimit:
            break
        title = article
        # resolve redirects
        if not wikipydia.query_exists(title, lang):
            continue
        title = wikipydia.query_redirects(title, lang).replace(' ', '_')
        if title in mark:
            continue
        mark[title] = True
        # the file prefix for output files: force / to be quoted and % not to be quoted
        file_prefix = urllib.quote(title.replace(' ', '_').encode('utf8'), safe="%")
        if file_prefix.startswith('.'):
            file_prefix = "%2E" + file_prefix[1:]
        if dryrun:
            print file_prefix
            success += 1
            continue
        done = False
        no_retry = 0
        wikimarkup = ''
        while not done and no_retry < retry:
            try:
                revid = values['thenid']
                if revid == 0:
                    revid = wikipydia.query_revid_by_date_fallback(title, lang, date)
                wikimarkup = wikipydia.query_text_raw_by_revid(revid, lang)['text']
                done = True
            except:
                no_retry += 1
                time.sleep(wait)
        if not wikimarkup:
            print 'Retrieving', title, 'failed'
            print 'RevID:', revid
            print 'Date:', date.isoformat()
            continue
        try:
            sentences, tags, citations = wpTextExtractor.wiki2sentences(wikimarkup, determine_splitter(lang), True, True)
        except:
            sys.stdout.flush()
            sys.stdout.write('Failed retrieving the text from ' + title + '\n')
            traceback.print_exc()
            sys.stdout.flush()
            continue
        # substitute angle brackets with html-like character encodings
        #sentences = [re.sub('<', '&lt;', re.sub('>', '&gt;', s)) for s in sentences]
        #sentences.insert(0, urllib.unquote(file_prefix.replace('_',' ')) + '.')
        output_filename = os.path.join(output_dir, file_prefix + '.sentences')
        output = write_lines_to_file(output_filename, sentences)
        output_filename = os.path.join(output_dir, file_prefix + '.tags')
        output = write_lines_to_file(output_filename, tags)
        success += 1
        priorid = values['priorid']
        if priorid == 0:
            priorid = wikipydia.query_revid_by_date_fallback(title, lang, date - datetime.timedelta(days=15))
        articles[title] = {'score': values['score'], 'thenid': revid, 'priorid': priorid}
        sys.stderr.write('.')
    sys.stderr.write('\n')
    if not dryrun:
        if len(articles) > 1 or (len(articles) == 1 and output_dir != '.'):
            write_articles(articles, topics, os.path.join(output_dir, date.strftime('%Y-%m-%d') + '.articles.list'))
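# Usage sketch for fetch_articles_on_date. The topics layout ('score', 'thenid',
# 'priorid') mirrors the keys read inside the function; the title, score and
# zero revision ids here are made up, and thenid/priorid of 0 trigger the
# by-date fallback lookup as in the code above. dryrun=True only prints the
# output file prefix and writes nothing.
def _example_fetch_articles_on_date():
    topics = {'Barack Obama': {'score': 1.0, 'thenid': 0, 'priorid': 0}}
    fetch_articles_on_date(topics, datetime.date(2009, 10, 12), 'en',
                           'sentences/2009-10-12', upperlimit=1, dryrun=True)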