def naivepatternHarvester(title, propertyWorder, wikipediaDump,
                          naivePredicateStatistics, naiveSubjectStatistics):
    language = propertyWorder.getLanguage()
    print('Working on `' + title + '`')

    # Get triples from DBPedia
    print_n_flush('Querying DBPedia...')
    iri = '<' + namespaces['dbpedia'] + title + '>'
    dbpediaData = fetchSubjectTriples(iri, language)
    # End of DBPedia get triples
    print 'OK'

    sourceWiki = language

    # Retrieve Wikipedia article
    print_n_flush('Retrieving article from Wikipedia...')
    # Obtain a pattern graph for the subject
    titleLabelSingleton = getPredicateValues(dbpediaData, 'rdfs:label')
    # titleLabelSingleton should contain exactly one value
    # (i.e. there is only one triple for predicate rdfs:label)
    if len(titleLabelSingleton) != 1:
        return
    titleLabel = iter(titleLabelSingleton).next()
    # We don't do this anymore
    # text = getCurrentWikiArticleText(sourceWiki, title)
    # We do this instead, fetching the article from the Wikipedia dump
    try:
        text = wikipediaDump.get_page_contents_by_title(
            unidecode(titleLabel)).decode('utf-8')
    except KeyError:
        print_n_flush('\nCould not find a page with this title: "' +
                      unidecode(titleLabel) + '", skipping')
        return
    # End of Wikipedia article retrieval
    print 'OK'

    # Remove wiki markup
    print_n_flush('Getting rid of wiki markup...')
    # Preliminary wiki markup cleanup
    text = WikiExtractor.clean(text)
    # Final wiki markup cleanup (turning the text into a list of section
    # titles and paragraphs)
    text = WikiExtractor.compact(text)
    # End of wiki markup cleaning
    print 'OK'

    mergedText = u' '.join(text)
    naivepatterns.naiveStatistics(title, mergedText, dbpediaData, propertyWorder,
                                  naivePredicateStatistics,
                                  naiveSubjectStatistics, 3, False)
def get_kb_description(self, topic_title):
    raw_content = wikipedia_api_util.get_raw_page_text(topic_title)
    cleaned = WikiExtractor.clean(raw_content)
    compacted = WikiExtractor.compact(cleaned)
    desc = ' '.join(compacted)
    # Fall back to the topic title when no description text survives cleaning
    if not desc.strip():
        return topic_title
    return desc
def cleaner_Both(wikidoc):
    '''Use WikiExtractor for cleaning.

    Use parser-from-hell for links.
    '''
    wikidoc.clean_text = WikiExtractor.clean(wikidoc.wiki_text)
    wp = WikiTextProcessor(wikidoc.wiki_text)
    wikidoc.meta[WdNames.LINKS] = wp.get_links()
    return wikidoc
def clean_markups(self, text):
    if not text:
        return ""
    clean_text = WikiExtractor.clean(text)
    clean_frags = WikiExtractor.compact(clean_text)
    # Strip any residual HTML tags from each fragment
    clean_html = [re.sub(HTML_TAG_REGEX, '', frag) for frag in clean_frags]
    return "\n".join(clean_html)
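# clean_markups() above relies on a module-level HTML_TAG_REGEX constant that
# is not shown in this file. A minimal, self-contained sketch of that final
# cleanup step follows; the regex definition and the sample fragment are
# assumptions added for illustration, not taken from the original code.
import re

HTML_TAG_REGEX = r'<[^>]+>'  # assumed: matches any leftover HTML tag

fragment = 'The <b>quick</b> brown fox<br/> jumps.'  # hypothetical input
print(re.sub(HTML_TAG_REGEX, '', fragment))  # -> The quick brown fox jumps.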
def remove_markup(self):
    """Remove wiki markup, leaving just the plain text."""
    # First fix wiktionary links that aren't being handled properly
    # by the WikiExtractor library.
    wikt = r"\[{2,}wikt:[^\|]+\|([^\]]+)\]{2,}"
    self.text = re.sub(wikt, r'\1', self.text)
    broken_wikt = r"{{broken wikt link\|([^\|}]+)(?:\|([^}]+))?}{2,}"
    self.text = re.sub(broken_wikt, r'\1', self.text)
    # Use the WikiExtractor library to finish processing
    self.text = WikiExtractor.clean(self.text)
    self.text = '\n'.join(WikiExtractor.compact(self.text))
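# A small, self-contained check of the two regex fixes used in remove_markup()
# above. The patterns are copied from the method; the sample string is a
# hypothetical example added for illustration.
import re

wikt = r"\[{2,}wikt:[^\|]+\|([^\]]+)\]{2,}"
broken_wikt = r"{{broken wikt link\|([^\|}]+)(?:\|([^}]+))?}{2,}"

sample = 'A [[wikt:lupus|wolf]] and a {{broken wikt link|vulpes}} walked in.'
sample = re.sub(wikt, r'\1', sample)
sample = re.sub(broken_wikt, r'\1', sample)
print(sample)  # -> A wolf and a vulpes walked in.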
def page_handler(page):
    """Write the right bits to the right files."""
    global db_cursor
    global db
    try:
        if 'redirect' in page:
            # Store the redirect title as a synonym of the target article
            synonym_data = {
                'synonym': page['title'] + ';',
                'redirect': page['redirect']
            }
            db_cursor.execute(
                """
                UPDATE articles
                SET synonyms = IFNULL(CONCAT(synonyms, %(synonym)s), %(synonym)s)
                WHERE title = %(redirect)s
                """, synonym_data)
            db.commit()
            return

        # Unescape HTML entities, strip tags, then remove wiki markup
        text = HTMLParser.HTMLParser().unescape(
            page['revisions'][-1]['text'])
        text = ''.join(BeautifulSoup(text).findAll(text=True))
        text = WikiExtractor.clean(text)
        text = ''.join(WikiExtractor.compact(text))

        article_data = {
            'id': page['id'],
            'title': page['title'],
            'timestamp': page['revisions'][-1]['timestamp'],
            'text': text
        }
        print(page['id'])
        db_cursor.execute(
            """
            INSERT INTO articles(id, title, timestamp, text)
            VALUES (%(id)s, %(title)s, %(timestamp)s, %(text)s)
            """, article_data)
        db.commit()
    except Exception, e:
        print >> sys.stderr, "invoked error. id : %s, %s" % (page['id'], e)
def get_title(squery):
    """Get the wiki page text for the title squery."""
    url = ("http://en.wikipedia.org/w/api.php?action=query&prop=revisions"
           "&rvprop=content&format=json&" +
           str(urllib.urlencode({"titles": squery})))
    while True:
        try:
            dat = urllib.urlopen(url).read()
            dat = json.loads(dat)
            break
        except ValueError:
            # The proxy might reject the request; keep retrying
            print "Retrying"
    kys = dat['query']['pages'].keys()[0]
    dat = dat['query']['pages'][kys]['revisions'][-1]['*']
    # Handle redirection
    redir = re.findall(r"#REDIRECT \[\[(.*?)\]\]", dat, re.IGNORECASE)
    if len(redir) != 0:
        return get_title(redir[0])
    print "Retrieved " + str(squery)
    dat2 = we.clean(dat)
    return dat2  # TWEET_LENGTH word Tweet.
def cleaner_WikiExtractor(wikidoc):
    wikidoc.clean_text = WikiExtractor.clean(wikidoc.wiki_text)
    return wikidoc
def sampler(title, propertyWorder, wikipediaDump, sampleSentences):
    language = propertyWorder.getLanguage()
    print('Working on `%s`' % title)
    projectedTitle = unquote(title.replace('_', ' ')).decode(encoding='utf-8')
    primaryTitleLabels = {projectedTitle}
    print('Going with "%s"' % (projectedTitle))
    titleLabel = primaryTitleLabels.pop()

    ############################################################################
    # Retrieve article for subject
    #
    print_n_flush('Retrieving article from Wikipedia...')
    # We do this by fetching the article from the Wikipedia dump
    strTitleLabel = unidecode(titleLabel)
    try:
        rawArticle = wikipediaDump.get_page_contents_by_title(strTitleLabel)
    except KeyError:
        message = "Could not fetch the article for " + titleLabel
        logging.warning(message)
        print(message)
        return
    article = rawArticle.decode('utf-8')
    print 'OK'

    ### Expand relevant templates in the Wikipedia article
    print_n_flush('Expanding relevant templates...')
    article = removeSectionTitles(article)
    article = expandTemplates(article, propertyWorder)
    print 'OK'
    #END# Templates expansion

    ### Wiki markup cleaning
    print_n_flush('Getting rid of wiki markup...')
    # Preliminary cleanup
    article = WikiExtractor.clean(article)
    # Final cleanup (turning text into a list of section titles and paragraphs)
    article = WikiExtractor.compact(article)
    print 'OK'
    #END# Wiki markup cleaning

    for paragraph in article:
        # Account for a bug in the PunktSentenceTokenizer when handling
        # sentence-ending marks followed by a double quote mark
        paragraph = paragraph.replace('?"', '? "')
        paragraph = paragraph.replace('!"', '! "')
        paragraph = paragraph.replace('."', '. "')
        # TODO: Language-agnostic sentence tokenizer
        sentences = tokenize_sentence(paragraph)
        for sentence in sentences:
            sentence = propertyWorder.adjustText(sentence)
            sampleSentences.append(sentence)
from time import strftime
from jsonrpclib.jsonrpc import ProtocolError
import logging
import os
import sys

from pairseslib.pickling import pickleDump, pickleLoad

if __name__ == '__main__':
    logging.basicConfig(filename=os.path.join(cfg['home'], 'wikiread.log'),
                        level=logging.DEBUG,
                        format=cfg['logtimestampformat'])
    # Open the Wikipedia dump through wikidump
    wikipediaDump = wikiModel.Dump(
        '/Volumes/Data/wikidump/enwiki-20130304-pages-articles.xml',
        False, False)
    # Instantiate the English Wikipedia worder
    propertyWorder = EnglishWikipediaModule()

    text = wikipediaDump.get_page_contents_by_title('Bern').decode('utf-8')
    text = expandTemplates(text, propertyWorder)
    # Preliminary wiki markup cleanup
    text = WikiExtractor.clean(text)
    # Final wiki markup cleanup (turning the text into a list of section
    # titles and paragraphs)
    text = WikiExtractor.compact(text)
    for line in text:
        print(line.encode('utf-8'))
def patternHarvester(title, propertyWorder, wikipediaDump):
    language = propertyWorder.getLanguage()
    sourceWiki = language
    print('Working on `%s`' % title)

    ############################################################################
    # Fetch triples for subject
    #
    print_n_flush('Querying DBPedia...')
    subjectIRI = expandIRI('dbpedia:' + title)
    subjectTriples = fetchSubjectTriples(subjectIRI, language, False, False)
    print 'OK'
    # End of "Fetch triples for subject"
    #
    ############################################################################

    # Obtain title for the article (i.e. primary subject name)
    primaryTitleLabels = getValuesForPredicate(subjectTriples, 'rdfs:label')
    # primaryTitleLabels should be a singleton
    # (i.e. there is one triple for predicate rdfs:label)
    if len(primaryTitleLabels) != 1:
        projectedTitle = title.replace('_', ' ')
        primaryTitleLabels = {unicode(projectedTitle)}
        message = "Could not find a primary label for %s, will try %s" % (
            title, projectedTitle)
        print(message)
    titleLabel = primaryTitleLabels.pop()

    ############################################################################
    # Retrieve article for subject
    #
    print_n_flush('Retrieving article from Wikipedia...')
    # We don't do this anymore
    # article = getCurrentWikiArticleText(sourceWiki, title)
    # We do this instead, fetching the article from the Wikipedia dump
    strTitleLabel = unidecode(titleLabel)
    try:
        rawArticle = wikipediaDump.get_page_contents_by_title(strTitleLabel)
    except KeyError:
        message = "Could not fetch the article for " + titleLabel
        logging.warning(message)
        print(message)
        return
    article = rawArticle.decode('utf-8')
    print 'OK'
    # End of "Retrieve article for subject"
    ############################################################################

    subjectWordings = set()
    subjectWordings.add(titleLabel)
    # Retrieve secondary names (obtained from redirects to the primary article)
    # and add them as subject labels
    subjectWordings |= otherLabels(subjectIRI, language)
    # Filter and get the labels for the classes the subject is an instance of
    # (e.g. Los Angeles would have "city" as a label to an object for a
    # rdf:type triple)
    subjectClasses = getLabelsForPredicate(subjectTriples, 'rdf:type')
    wordedClassLabels = set()
    for classLabel in subjectClasses:
        captlzd, uncptlzd = propertyWorder.getClassLabelWording(classLabel)
        wordedClassLabels.add(uncptlzd)
        wordedClassLabels.add(captlzd)
    subjectWordings |= wordedClassLabels

    ### Compute and annotate wordings for the subject
    annotatedSubjectWordings = list()
    # Cycle through all wordings for the subject and get an annotation
    # for each one
    for subjectWording in subjectWordings:
        try:
            (root, words, graph) = annotateText(subjectWording)
        except AnnotationError:
            continue
        annotatedSubjectWordings.append((subjectWording, (root, words, graph)))

    ### Compute and annotate wordings for objects in each triple
    print_n_flush('Finding and annotating wordings for triple objects...')
    annotatedObjectWordings = list()
    predicateOccurrences = dict()
    for triple in subjectTriples:
        predicate = triple['p']['value']
        if predicate in ignored:
            continue
        if predicate not in predicateOccurrences:
            predicateOccurrences[predicate] = set()
        try:
            objectWording = getCommonWording(triple, propertyWorder)
        except CommonWordingNotFound:
            # TODO: Find out if any important data types are left out
            # if triple['p']['value'] not in notWorded:
            #     notWorded[triple['p']['value']] = list()
            # notWorded[triple['p']['value']].append(triple)
            # pprint(triple['p']['value'] + '::' + triple['o']['value'])
            # pprint(triple)
            continue
        try:
            (root, words, graph) = annotateText(objectWording)
        except AnnotationError:
            continue
        annotatedObjectWordings.append(
            (objectWording, (root, words, graph, predicate)))
    print 'OK'
    #END# Object wording annotation

    ### Expand relevant templates in the Wikipedia article
    print_n_flush('Expanding relevant templates...')
    article = expandTemplates(article, propertyWorder)
    print 'OK'
    #END# Templates expansion

    ### Wiki markup cleaning
    print_n_flush('Getting rid of wiki markup...')
    # Preliminary cleanup
    article = WikiExtractor.clean(article)
    # Final cleanup (turning text into a list of section titles and paragraphs)
    article = WikiExtractor.compact(article)
    print 'OK'
    #END# Wiki markup cleaning

    # Sentence counter
    i = 0
    j = -1
    for paragraph in article:
        # Account for a bug in the PunktSentenceTokenizer when handling
        # sentence-ending marks followed by a double quote mark
        paragraph = paragraph.replace('?"', '? "')
        paragraph = paragraph.replace('!"', '! "')
        paragraph = paragraph.replace('."', '. "')
        # TODO: Language-agnostic sentence tokenizer
        sentences = tokenize_sentence(paragraph)
        for sentence in sentences:
            sentence = propertyWorder.adjustText(sentence)
            # Statistics
            for ow, (owRootWord, owWords, owGraph,
                     predicate) in annotatedObjectWordings:
                if ow in sentence:
                    predicateOccurrences[predicate].add(ow)
            i += 1
            # Get the graph for this sentence
            print_n_flush('PS')
            # Parse the sentence through the Stanford NLP Core Tools
            try:
                (sentenceR, sentenceW, sentenceG, sentence,
                 sentenceWData) = annotateText(sentence, True)
            except AnnotationError:
                continue
            legalNodeIndices = map(lambda x: int(x[x.rindex("-") + 1:]),
                                   sentenceG.nodes())
            rootNode = 'ROOT-0'
            # From here on, the initials "sw" refer to "subject wording"
            for sw, (swRootWord, swWords, swGraph) in annotatedSubjectWordings:
                try:
                    swRootWordIndex = matchWording(sentence, sentenceW,
                                                   sentenceG, legalNodeIndices,
                                                   sentenceWData, sw, swWords,
                                                   swGraph, swRootWord)
                except ValueError:
                    # No match found for this wording in the sentence
                    continue
                subjectTarget = swRootWord + '-' + unicode(swRootWordIndex)
                # Compute and generate subgraph for shortest path to Subject;
                # s1 will be the nodes from root to subject
                try:
                    s1 = set(
                        shortestPathFromRoot(sentence, sentenceG, subjectTarget))
                except ShortestPathError:
                    continue
                # From here on, the initials "ow" refer to "object wording"
                # Compute and generate subgraph for shortest path to Object;
                # s2 is the set of nodes from root to object
                for ow, (owRootWord, owWords, owGraph,
                         predicate) in annotatedObjectWordings:
                    try:
                        owRootWordIndex = matchWording(
                            sentence, sentenceW, sentenceG, legalNodeIndices,
                            sentenceWData, ow, owWords, owGraph, owRootWord)
                    except ValueError:
                        # No match found for this wording in the sentence
                        continue
                    objectTarget = owRootWord + '-' + unicode(owRootWordIndex)
                    if objectTarget == subjectTarget:
                        # No use for this kind of pattern
                        continue
                    try:
                        s2 = set(
                            shortestPathFromRoot(sentence, sentenceG,
                                                 objectTarget))
                    except ShortestPathError:
                        continue
                    # At this point, we definitely have a pattern.
                    # Nodes in the spanning tree comprising solely the shortest
                    # paths to the subject and to the object
                    s = s1 | s2
                    # S is the aforementioned spanning tree
                    S = nx.DiGraph(sentenceG.subgraph(s), name=predicate)
                    anonRoot = unicode(cfg['roottag'] + '-0')
                    anonSubject = unicode(cfg['subjecttag'] + '-' +
                                          unicode(swRootWordIndex))
                    anonObject = unicode(cfg['objecttag'] + '-' +
                                         unicode(owRootWordIndex))
                    renamings = dict()
                    renamings[rootNode] = anonRoot
                    renamings[subjectTarget] = anonSubject
                    renamings[objectTarget] = anonObject
                    entities = list()
                    numerals = 0
                    try:
                        for node in S.nodes():
                            if node not in renamings.keys():
                                if propertyWorder.partOfProperNoun(node):
                                    # The word may refer to an entity; in this
                                    # case let's abstract from the word and
                                    # save a relation for this pattern
                                    index = int(node[node.rindex('-') + 1:])
                                    anonEntity = '%s%05d-%d' % (
                                        cfg['entitytagprefix'], len(entities),
                                        index)
                                    renamings[node] = anonEntity
                                    entityWording = associatedWording(
                                        sentence, node, sentenceG,
                                        sentenceWData,
                                        allowNestedWordingMatch=True)
                                    entities.append(
                                        (entityWording,
                                         getClasses(entityWording, language)))
                                elif isNumeric(node):
                                    index = int(node[node.rindex('-') + 1:])
                                    anonNumeral = '%s%05d-%d' % (
                                        cfg['numerictagprefix'], numerals,
                                        index)
                                    numerals += 1
                                    renamings[node] = anonNumeral
                    except AnnotationError:
                        continue
                    # First anonymize subject, object and entities
                    S = nx.relabel_nodes(S, renamings)
                    # Remove indices as well
                    indexlessNodes = map(lambda word: word[0:word.rindex("-")],
                                         S.nodes())
                    S = nx.relabel_nodes(S, dict(zip(S.nodes(),
                                                     indexlessNodes)))
                    if '' in S.nodes():
                        # A bug in either the SCNLP or the Python wrapper makes
                        # empty nodes out of schwas and other unicode chars
                        # that might be used as a diacritic
                        # TODO: Find a fix for this
                        message = ('Invalid dependencies for this sentence: ' +
                                   sentence)
                        logging.warning(message)
                        print(message)
                        continue
                    # DOT representation of the graph
                    pydotS = nx.to_pydot(S).to_string().encode(
                        encoding='UTF-8', errors='strict')
                    pattern = Pattern(pydotS, predicate, entities, title, sw,
                                      ow, sentence)
                    try:
                        saveGraph(S, pattern.hash)
                    except (TypeError, UnicodeEncodeError):
                        # TODO: Fix this "TypeError: coercing to Unicode: need
                        # string or buffer, NoneType found" error, and also:
                        # "UnicodeEncodeError: 'ascii' codec can't encode
                        # character"
                        checkLog = True
                        logging.warning('A graph could not be saved: '
                                        'Sentence: ' + sentence +
                                        'Nodes: ' + str(S.nodes()) +
                                        'Edges: ' + str(S.edges(data=True)))

    storePredicateOccurrences(title, predicateOccurrences)