def naivepatternHarvester(title, propertyWorder, wikipediaDump,
                          naivePredicateStatistics, naiveSubjectStatistics):
    language = propertyWorder.getLanguage()
    print('Working on `' + title + '`')

    # Get triples from DBPedia
    print_n_flush('Querying DBPedia...')
    iri = '<' + namespaces['dbpedia'] + title + '>'
    dbpediaData = fetchSubjectTriples(iri, language)
    # End of DBPedia get triples
    print 'OK'

    sourceWiki = language

    # Retrieve Wikipedia article
    print_n_flush('Retrieving article from Wikipedia...')
    # Obtain a pattern graph for the subject
    titleLabelSingleton = getPredicateValues(dbpediaData, 'rdfs:label')
    # We expect titleLabelSingleton to be a singleton
    # (i.e. there is only one triple for predicate rdfs:label)
    try:
        assert len(titleLabelSingleton) == 1
    except AssertionError:
        return
    titleLabel = iter(titleLabelSingleton).next()
    # We don't do this anymore
    # text = getCurrentWikiArticleText(sourceWiki, title)
    # We do this instead
    try:
        text = wikipediaDump.get_page_contents_by_title(
            unidecode(titleLabel)).decode('utf-8')
    except KeyError:
        print_n_flush('\nCould not find a page with this title: "' +
                      unidecode(titleLabel) + '", skipping')
        return
    # End of Wikipedia article retrieving
    print 'OK'

    # Remove wiki markup
    print_n_flush('Getting rid of wiki markup...')
    # Preliminary wiki markup cleanup
    text = WikiExtractor.clean(text)
    # Final wiki markup cleanup (turning text into a list of section titles
    # and paragraphs)
    text = WikiExtractor.compact(text)
    # End of wiki markup cleaning
    print 'OK'

    mergedText = u' '.join(text)
    naivepatterns.naiveStatistics(title, mergedText, dbpediaData,
                                  propertyWorder, naivePredicateStatistics,
                                  naiveSubjectStatistics, 3, False)
def get_kb_description(self, topic_title):
    raw_content = wikipedia_api_util.get_raw_page_text(topic_title)
    cleaned = WikiExtractor.clean(raw_content)
    compacted = WikiExtractor.compact(cleaned)
    desc = ' '.join(compacted)
    # Fall back to the title itself when no description could be extracted
    if desc is None or desc.strip() == '':
        return topic_title
    return desc
def run(path_to_dump, wiki_files_dir='wiki', path_to_res='res_wiki.csv', workers_num=3):
    # step #1 - process files via wikiextractor
    argv = shlex.split(f'-o {wiki_files_dir} --json --processes {workers_num} {path_to_dump}')
    sys.argv = [sys.argv[0]] + argv
    print(argv)
    WikiExtractor.main()

    # step #2 - post-processing
    postprocessing.run(wiki_files_dir, path_to_res, workers_num)
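# A sketch of how this entry point might be invoked. The dump path, output
# names, and worker count below are placeholders, not values from the
# original project.
if __name__ == '__main__':
    run('enwiki-latest-pages-articles.xml.bz2',
        wiki_files_dir='wiki',        # directory for WikiExtractor's --json shards
        path_to_res='res_wiki.csv',   # CSV written by the post-processing step
        workers_num=4)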
def clean_markups(self, text):
    if not text:
        return ""
    clean_text = WikiExtractor.clean(text)
    clean_frags = WikiExtractor.compact(clean_text)
    # Strip any HTML tags that survived the wiki-markup cleanup
    clean_html = [re.sub(HTML_TAG_REGEX, '', frag) for frag in clean_frags]
    return "\n".join(clean_html) if len(clean_html) > 0 else ""
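# HTML_TAG_REGEX is defined elsewhere in that module. A plausible stand-in
# (an assumption, not the project's actual pattern) is a simple tag matcher:
import re

HTML_TAG_REGEX = re.compile(r'<[^>]+>')  # assumed definition for illustration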
def remove_markup(self):
    """Remove wiki markup, leaving just the plain text."""
    # First fix wiktionary links that aren't being handled properly
    # by the WikiExtractor library.
    wikt = r"\[{2,}wikt:[^\|]+\|([^\]]+)\]{2,}"
    self.text = re.sub(wikt, r'\1', self.text)
    broken_wikt = r"{{broken wikt link\|([^\|}]+)(?:\|([^}]+))?}{2,}"
    self.text = re.sub(broken_wikt, r'\1', self.text)
    # Use the WikiExtractor library to finish processing
    self.text = WikiExtractor.clean(self.text)
    self.text = '\n'.join(WikiExtractor.compact(self.text))
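# A quick self-contained check of the two pre-cleanup regexes above; the
# sample wiki strings are invented for illustration.
import re

wikt = r"\[{2,}wikt:[^\|]+\|([^\]]+)\]{2,}"
broken_wikt = r"{{broken wikt link\|([^\|}]+)(?:\|([^}]+))?}{2,}"

assert re.sub(wikt, r'\1', "a [[wikt:dog|dog]] ran") == "a dog ran"
assert re.sub(broken_wikt, r'\1', "the {{broken wikt link|cat}} sat") == "the cat sat"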
def page_handler(page):
    """Write the right bits to the right files."""
    global db_cursor
    global db
    try:
        if 'redirect' in page:
            synonym_data = {
                'synonym': page['title'] + ';',
                'redirect': page['redirect']
            }
            db_cursor.execute("""
                UPDATE articles
                SET synonyms = IFNULL(CONCAT(synonyms, %(synonym)s), %(synonym)s)
                WHERE title = %(redirect)s
                """, synonym_data)
            #print('Number of rows inserted: %d' % db_cursor.rowcount)
            db.commit()
            return
        #print(page['title'])
        #print("page_id :", page['id'])
        #print("time :", page['revisions'][-1]['timestamp'])
        text = HTMLParser.HTMLParser().unescape(page['revisions'][-1]['text'])
        text = ''.join(BeautifulSoup(text).findAll(text=True))
        text = WikiExtractor.clean(text)
        text = ''.join(WikiExtractor.compact(text))
        article_data = {
            'id': page['id'],
            'title': page['title'],
            'timestamp': page['revisions'][-1]['timestamp'],
            'text': text
        }
        print(page['id'])
        db_cursor.execute("""
            INSERT INTO articles(id, title, timestamp, text)
            VALUES (%(id)s, %(title)s, %(timestamp)s, %(text)s)
            """, article_data)
        #print('Number of rows inserted: %d' % db_cursor.rowcount)
        db.commit()
    except Exception, e:
        print >> sys.stderr, "invoked error. id : %s, %s" % (page['id'], e)
def format_wikicorpus(input, output, bytes, num_process, num_out_files): if input is None: raise ValueError('input file is empty.') if not input.endswith('xml.bz2'): raise ValueError('input file not *.xml.bz2.') if not os.path.exists(output): os.makedirs(output) # Use WikiExtractor to extract the content WikiExtractor = try_import_wikiextractor() wiki_path = os.path.join(output, 'extracted') sys.argv = ['prog', '-b', bytes, '-o', wiki_path, input] WikiExtractor.main() # Merge extracted content into txt files prepared_path = os.path.join(output, 'prepared_wikipedia') if not os.path.exists(prepared_path): os.makedirs(prepared_path) filenames = get_formatting_list(wiki_path, recursive=True) num_files = len(filenames) num_out_files = min(num_out_files, num_files) file_volume = math.ceil(num_files / num_out_files) splited_files = [ filenames[i:i + file_volume] for i in range(0, num_files, file_volume) ] num_out_files = len(splited_files) output_files = [ os.path.join(prepared_path, "wikipedia-prepared-{}.txt".format(str(i).zfill(4))) for i in range(num_out_files) ] print("All prepared raw text will be saved in {} txt files".format( num_out_files)) num_process = min(num_process, num_out_files) print('Start preprocessing {} text files with {} cores'.format( num_files, num_process)) process_args = [(splited_files[i], output_files[i]) for i in range(num_out_files)] start_time = time.time() with multiprocessing.Pool(num_process) as pool: f_read = 0 for i, _ in enumerate(pool.imap(merge, process_args)): elapsed = time.time() - start_time f_read += len(splited_files[i]) print( "prepared {:} files, Elapsed: {:.2f}s, ETA: {:.2f}s, ".format( f_read, elapsed, (num_files - f_read) / (num_files / elapsed))) print("Done preparation within {:.2f} seconds".format(elapsed))
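# A sketch of calling format_wikicorpus directly; all paths and sizes are
# placeholders (the original presumably wires these up from CLI arguments).
format_wikicorpus(
    input='enwiki-latest-pages-articles.xml.bz2',  # must end with 'xml.bz2'
    output='preprocessed',   # 'extracted/' and 'prepared_wikipedia/' go here
    bytes='100M',            # forwarded to WikiExtractor's -b/--bytes option
    num_process=8,
    num_out_files=500)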
def cleaner_Both(wikidoc):
    '''Use WikiExtractor for cleaning.
    Use Parser-from-hell for links.
    '''
    wikidoc.clean_text = WikiExtractor.clean(wikidoc.wiki_text)
    wp = WikiTextProcessor(wikidoc.wiki_text)
    wikidoc.meta[WdNames.LINKS] = wp.get_links()
    return wikidoc
def go(name, date, cache=False):
    query_fmt = \
        'http://en.wikipedia.org/w/api.php?' \
        'action=query' \
        '&format=json' \
        '&prop=revisions' \
        '&list=' \
        '&pageids={}' \
        '&rvsection=0' \
        '&rvprop=timestamp%7Ccontent' \
        '&rvstart={:04d}-{:02d}-{:02d}T00%3A00%3A00.000Z'

    # format the query
    timestamp = "{}-{}-{}".format(date.year, date.month, date.day)
    query = query_fmt.format(page_ids[name], date.year, date.month, date.day)

    sentiment_result = SentimentResult(name, page_ids[name], timestamp)

    # check to see if we actually need to perform the lookup
    if cache and sentiment_result.is_cached:
        sentiment_result.sync()
        return sentiment_result

    # if we don't have it in the cache, perform the query
    data = json.loads(requests.get(query).text)

    # pull the revision markup out of the JSON response
    wiki_markup = data['query']['pages'][str(page_ids[name])]['revisions'][0]['*']

    def format(text):
        # join the non-empty lines, dropping the first and last
        lines = text.split('\n')
        return ' '.join([i for i in lines if i][1:-1])

    # extract readable text from the markup
    extractor = WikiExtractor.Extractor(page_ids[name], 0, name, wiki_markup)
    sio = io.StringIO()
    extractor.extract(sio)
    sio.seek(0)
    text = format(sio.read())

    # score the result with Google's sentiment analysis
    score, magnitude = analyze(text)
    sentiment_result.score = score
    sentiment_result.magnitude = magnitude
    sentiment_result.length = len(text)

    # cache to a file, if necessary
    if cache:
        sentiment_result.cache()

    return sentiment_result
def extractCleanText(page, anchorID, english27, Title_ID_All, Redirect_Hashmap,
                     In_Link_Graph_Degree, min_degree):
    """
    Extract the clean text from a Wikipedia page (tag <text> of the dump).

    @param page: Wikipedia page (tag <text> of the dump)
    @param anchorID: if true, the anchor ID will be used to replace the
        surface form in internal links
    @return: clean text of the Wikipedia page
    """
    page = WikiExtractor.cleanText(page, anchorID, Title_ID_All,
                                   Redirect_Hashmap, In_Link_Graph_Degree,
                                   min_degree, english27)
    if english27:
        # Convert to the 27-character English ASCII alphabet plus space
        page = clean27English(page)
    return page
def renderRevision(rev, title):
    """Renders a revision dictionary in HTML/WikiMarkup into plaintext.
    TODO: HTML conversion!"""
    if rev["*"] is not None and rev["format"] == "wikimarkup":
        text = rev["*"]
        out = io.StringIO()
        extractor = WikiExtractor.Extractor(0, 0, title, text.split("\n"))
        extractor.extract(out)
        rev["*"] = out.getvalue()
        out.close()
        rev = splitBySentences(rev)
        rev["format"] = "plaintext"
    return rev
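# Several snippets here share the same extraction idiom: build a
# WikiExtractor.Extractor, stream its output into an in-memory buffer, and
# read the plain text back. A minimal sketch of that idiom, assuming the
# four-argument Extractor(id, revid, title, lines) constructor used above
# (signatures differ across WikiExtractor versions):
import io
import WikiExtractor

def markup_to_plaintext(page_id, title, markup):
    out = io.StringIO()
    WikiExtractor.Extractor(page_id, 0, title, markup.split("\n")).extract(out)
    return out.getvalue()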
def get_title(squery): #Gets the wiki page for the title squery. url = "http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&"+str(urllib.urlencode({"titles":squery})) while True: try: dat = urllib.urlopen(url).read() dat = json.loads(dat) break except ValueError: #Proxy might reject request. print "Retrying" pass kys = dat['query']['pages'].keys()[0] dat = dat['query']['pages'][kys]['revisions'][-1]['*'] redir = re.findall("#REDIRECT \[\[(.*?)\]\]",dat,re.IGNORECASE) #Handle Redirection. if(len(redir)!=0): return get_title(redir[0]) print "Retrieved " + str(squery) dat2 = we.clean(dat) return dat2 #TWEET_LENGTH word Tweet.
def cleanArticle(inCollection, outCollection): lastId = -1 if outCollection.count() != 0: lastId = outCollection.find().sort([("_id", pymongo.DESCENDING)]).limit(1)[0]["_id"] print "Starting from id greater than: {}".format(lastId) sys.stdout.flush() numCleaned = 0 for article in inCollection.find({"_id": {"$gt": lastId}}).sort([("_id", pymongo.ASCENDING)]): # Parse it. extractor = WikiExtractor.Extractor(article["_id"], article["title"], [article["text"]]) article["text"] = extractor.clean() outCollection.insert_one(article) # Print progress. numCleaned += 1 if numCleaned % 1000 == 0: print "Cleaned {} articles so far...".format(numCleaned) sys.stdout.flush() return numCleaned
def get_wiki_page_clean(article_title): xml_str = get_article_xmlpage(article_title) clean_str = WikiExtractor.run(xml_str, keep_sections=False, keep_links=True) return clean_str
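# A sketch of calling this helper; the article title is a placeholder, and
# WikiExtractor.run(...) with keep_sections/keep_links is this project's own
# wrapper rather than the stock WikiExtractor interface.
plain_text = get_wiki_page_clean('Alan Turing')
print(plain_text[:500])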
infobox = utility.extractInfobox(pageText) if infobox != "": infoboxList[page_ID] = infobox # extract heads headings = utility.extractHeads(pageText) for head in headings: head = head.replace('=', '').strip() headDictionary[page_ID].append(head) # extract category categories = utility.extractCategory(pageText) for cat in categories: categoryDictionary[page_ID].append(cat) anchors, anchorSurfaces = WikiExtractor.getAnchor(pageText) #create surface dictionary for anchorSurface in anchorSurfaces: anchorSurface = anchorSurface.replace('\n', '').replace('\r', '') temp = anchorSurface.split("@@") surface = temp[1] anchor = temp[0] #change encoding anchor = anchor.encode("utf-8") #capitalize the first letter anchor = anchor[0:1].capitalize() + anchor[1:] # remove # sign if '#' in anchor: anchor = anchor[0:anchor.index('#')]
def words(self, normalise=False, strict_words=True, lowercase=False):
    # Sentence-ending sequences: ? ! . ?" !" ." ?'' !'' .''
    sentence_end_re = re.compile(
        u"(?:\.|\?|!|\.''|\?''|!''|\?\"|!\"|\.\")$", re.U)

    class outter(object):
        def __init__(self):
            self.ls = []

        def write(self, l):
            self.ls.append(l)

        def text(self):
            return u"".join(self.ls[1:-1])

    pages = 0
    for i, (id, title, page) in enumerate(self.pages()):
        pages += 1
        out = outter()
        WikiExtractor.Extractor(id, title, page).extract(out)
        lastw = None
        for w in out.text().split():
            wnorm = w
            if lastw is None or sentence_end_re.search(lastw):
                sentence_start = True
            else:
                sentence_start = False
            if not sentence_start:
                # Special case: section headings such as ==Zdroje ("Sources")
                if w.startswith("==") or lastw.endswith("=="):
                    sentence_start = True
            if normalise:
                wnorm = self.normalise(w, True, False)
            if strict_words:
                if wnorm.isupper() or wnorm.isnumeric():
                    wnorm = ""
                else:
                    wnorm1 = self.normalise(wnorm, False, True)
                    if len(wnorm1) != len(wnorm):
                        wnorm = ""
            if lowercase and 0 < len(wnorm):
                wnorm = wnorm.lower()
            if 0 == len(wnorm):
                lastw = w
                continue
            if not sentence_start and w[0].isupper():
                pass
            if sentence_start and not w[0].isupper():
                pass
            yield w, wnorm, sentence_start, pages
            lastw = w
text = wiki._normalise_re_apos1.sub(ur'\1"', text) text = wiki._normalise_re_apos2.sub(ur'"\1', text) text = wiki._normalise_re_apos3.sub(ur'"', text) text = wiki._normalise_re_non_letter_start.sub(ur'', text) text = wiki._normalise_re_non_letter_end.sub(ur'', text) if inner: text = wiki._normalise_re_non_letter.sub(ur'', text) return text if __name__ == '__main__': w = wiki("../skwiki-20151226-pages-articles.xml") class outter(object): def __init__(self): self.ls = [] def write(self, l): self.ls.append(l) def text(self): return "".join(self.ls[1:-1]) for i, (id, title, page) in enumerate(w.pages()): out = outter() WikiExtractor.Extractor(id, title, page).extract(out) print out.text() if i > 5000: break
def cleaner_WikiExtractor(wikidoc):
    wikidoc.clean_text = WikiExtractor.clean(wikidoc.wiki_text)
    return wikidoc
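# Both cleaner_Both (earlier) and cleaner_WikiExtractor (above) assume a
# document object exposing wiki_text, clean_text, and a meta dict. A minimal
# stand-in (an assumption, not the project's real class) makes that contract
# explicit:
class WikiDoc(object):
    """Assumed minimal shape of the document object the cleaners mutate."""

    def __init__(self, wiki_text):
        self.wiki_text = wiki_text   # raw wiki markup in
        self.clean_text = None       # plain text out (set by a cleaner)
        self.meta = {}               # extra fields, e.g. meta[WdNames.LINKS]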
def sampler(title, propertyWorder, wikipediaDump, sampleSentences):
    language = propertyWorder.getLanguage()
    print('Working on `%s`' % title)
    projectedTitle = unquote(title.replace('_', ' ')).decode(encoding='utf-8')
    primaryTitleLabels = {projectedTitle}
    print('Going with "%s"' % (projectedTitle))
    titleLabel = primaryTitleLabels.pop()

    ############################################################################
    # Retrieve article for subject #
    print_n_flush('Retrieving article from Wikipedia...')
    # We fetch the article from the wikipedia dump
    strTitleLabel = unidecode(titleLabel)
    try:
        rawArticle = wikipediaDump.get_page_contents_by_title(strTitleLabel)
    except KeyError:
        message = "Could not fetch the article for " + titleLabel
        logging.warning(message)
        print(message)
        return
    article = rawArticle.decode('utf-8')
    print 'OK'

    ### Expand relevant templates in the Wikipedia article
    print_n_flush('Expanding relevant templates...')
    article = removeSectionTitles(article)
    article = expandTemplates(article, propertyWorder)
    print 'OK'
    #END# Templates expansion

    ### Wiki markup cleaning
    print_n_flush('Getting rid of wiki markup...')
    # Preliminary cleanup
    article = WikiExtractor.clean(article)
    # Final cleanup (turning text into a list of section titles and paragraphs)
    article = WikiExtractor.compact(article)
    print 'OK'
    #END# Wiki markup cleaning

    for paragraph in article:
        # Account for a bug in the PunktSentenceTokenizer when handling
        # sentence-ending marks followed by a double quote mark
        paragraph = paragraph.replace('?"', '? "')
        paragraph = paragraph.replace('!"', '! "')
        paragraph = paragraph.replace('."', '. "')
        #TODO: Language-agnostic sentence tokenizer
        sentences = tokenize_sentence(paragraph)
        for sentence in sentences:
            sentence = propertyWorder.adjustText(sentence)
            sampleSentences.append(sentence)
def get_keywords(title, primary):
    url_non_en = "https://" + primary + ".wikipedia.org/wiki/Special:Export/" + title
    resp = requests.get(url_non_en)
    with open('non_en.xml', 'wb') as f:
        f.write(resp.content)

    page = wptools.page(title, lang=primary)
    page.get_parse()
    wikidata_id = page.data['wikibase']
    page_en = wptools.page(wikibase=wikidata_id)
    page_en.get_wikidata()
    title_en = page_en.data['title']

    url_en = "https://en.wikipedia.org/wiki/Special:Export/" + title_en
    resp = requests.get(url_en)
    with open('eng.xml', 'wb') as f:
        f.write(resp.content)

    # Execute the WikiExtractor.py code to process the non-en and en XMLs
    WikiExtractor.main()
    print("Code Executed")

    with open("non_en/AA/wiki_00", 'r') as f:
        non_en_text = f.read()
    with open("en/AA/wiki_00", 'r') as f:
        en_text = f.read()

    url = "https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles=" + title_en
    url += "&format=json"
    resp = requests.get(url)
    data = json.loads(resp.content.decode('utf8'))
    pages = data["query"]["pages"]
    for items in pages:
        try:
            ID = pages[items]["pageprops"]["wikibase_item"]
            title_en = pages[items]["title"]
        except KeyError:
            continue
    print("Title English")
    print(title_en)

    # ----- Obtain section headings from the English XML page -----
    section_headings = [i.start() for i in re.finditer("<sec>", en_text)]
    section_headings_end = [i.start() for i in re.finditer("</sec>", en_text)]
    print(len(section_headings))
    headings = []
    headings_pos = []
    for j in range(len(section_headings)):
        occurence = section_headings[j]
        title = en_text[occurence + 5:section_headings_end[j] - 1]
        headings.append(title)
        headings_pos.append(section_headings_end[j] + 6)

    # ----- Part considering anchor text as keywords -----
    start = time.process_time()
    keywords_en = []
    keywords = []
    occur_en = [i.start() for i in re.finditer("href", en_text)]
    end_occur_en = [i.start() for i in re.finditer("</a>", en_text)]
    occur = [i.start() for i in re.finditer("href", non_en_text)]
    end_occur = [i.start() for i in re.finditer("</a>", non_en_text)]
    pos_keywords_en = {}
    for j in range(len(occur_en)):
        occurence = occur_en[j]
        title = ""
        pos = occurence + 6
        while en_text[pos] != "\"":
            title += en_text[pos]
            pos += 1
        pos += 2
        url = title
        title = en_text[pos:end_occur_en[j]]
        if title_en.find(title) != -1:
            continue
        elif (len(url) / len(title)) > 3:
            continue
        if title not in pos_keywords_en:
            pos_keywords_en[title] = occurence + 6
            keywords_en.append(title)
    for j in range(len(occur)):
        occurence = occur[j]
        title = ""
        pos = occurence + 6
        while non_en_text[pos] != "\"":
            title += non_en_text[pos]
            pos += 1
        pos += 2
        url = title
        title = non_en_text[pos:end_occur[j]]
        if title == 'के':
            continue
        keywords.append(title)

    dict_keys = {}
    dict_keys_en = {}
    mappings_eng = {}
    mappings_non_en = {}
    # Resolve the English keywords to Wikidata IDs, 50 titles per API call
    for i in range(0, len(keywords_en), 50):
        url = "https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles="
        count = 0
        j = i
        while j < len(keywords_en) and count < 50:
            url += keywords_en[j]
            if count != 49:
                url += "|"
            count += 1
            j += 1
        url += "&format=json"
        resp = requests.get(url)
        data = json.loads(resp.content.decode('utf8'))
        pages = data["query"]["pages"]
        for items in pages:
            try:
                ID = pages[items]["pageprops"]["wikibase_item"]
                title = pages[items]["title"]
                dict_keys_en[ID] = title
                mappings_eng[title] = title
            except KeyError:
                continue
        normalizations = {}
        try:
            normalized = data["query"]["normalized"]
            for items in normalized:
                try:
                    normalizations[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
        try:
            redirects = data["query"]["redirects"]
            for items in redirects:
                try:
                    if items["from"] in normalizations:
                        mappings_eng[items["to"]] = normalizations[items["from"]]
                    else:
                        mappings_eng[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue

    # Resolve the non-English keywords to Wikidata IDs, 50 titles per API call
    for i in range(0, len(keywords), 50):
        url = "https://" + primary + ".wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles="
        count = 0
        j = i
        while j < len(keywords) and count < 50:
            url += keywords[j]
            if count != 49:
                url += "|"
            count += 1
            j += 1
        url += "&format=json"
        resp = requests.get(url)
        data = json.loads(resp.content.decode('utf8'))
        pages = data["query"]["pages"]
        for items in pages:
            try:
                ID = pages[items]["pageprops"]["wikibase_item"]
                title = pages[items]["title"]
                dict_keys[ID] = title
                mappings_non_en[title] = title
            except KeyError:
                continue
        normalizations = {}
        try:
            normalized = data["query"]["normalized"]
            for items in normalized:
                try:
                    normalizations[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
        try:
            redirects = data["query"]["redirects"]
            for items in redirects:
                try:
                    if items["from"] in normalizations:
                        mappings_non_en[items["to"]] = normalizations[items["from"]]
                    else:
                        mappings_non_en[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue

    print(len(dict_keys_en))
    print(len(dict_keys))

    # English keywords whose Wikidata ID has no counterpart in the non-English set
    relevant_english = []
    for keys in dict_keys_en:
        if keys not in dict_keys:
            relevant_english.append(mappings_eng[dict_keys_en[keys]])
    # Non-English keywords whose Wikidata ID has no counterpart in the English set
    unique_non_en = []
    for keys in dict_keys:
        if keys not in dict_keys_en:
            unique_non_en.append(mappings_non_en[dict_keys[keys]])

    relevant_english_links = []
    base_link = "https://en.wikipedia.org/wiki/" + title_en.replace(" ", "_")
    for key_title in relevant_english:
        section_level = len(headings_pos) - 1
        if key_title not in pos_keywords_en:
            relevant_english_links.append(base_link)
            continue
        while section_level >= 0:
            if pos_keywords_en[key_title] > headings_pos[section_level]:
                break
            section_level = section_level - 1
        if pos_keywords_en[key_title] < headings_pos[0]:
            relevant_english_links.append(base_link)
        else:
            link_to_section = base_link + "#" + headings[section_level].replace(" ", "_")
            relevant_english_links.append(link_to_section)

    end = time.process_time()
    print("Time taken to obtain mapping between keywords and page ids")
    print(end - start)
    print("--------------------------------------------------------------------------------")

    print("Wikipedia2vec execution begins")
    start = time.process_time()
    similarity_score = []
    try:
        title_vec = wiki2vec.get_entity_vector(title_en)
        entity_found = True
    except:
        entity_found = False

    count = 0
    if entity_found:
        out = open("scores.txt", "w")
        for i in range(len(relevant_english)):
            try:
                # Cosine similarity between the title entity and the keyword entity
                key_vec = wiki2vec.get_entity_vector(relevant_english[i])
                dot = np.dot(title_vec, key_vec)
                norma = np.linalg.norm(title_vec)
                normb = np.linalg.norm(key_vec)
                score = dot / (norma * normb)
            except:
                count += 1
                score = 0
            similarity_score.append(score)
            out.write(str(relevant_english[i]) + "--> " + str(similarity_score[i]) + "\n")
        out.close()
        print("Len relevant english and similarity score")
        print(len(relevant_english))
        print(len(similarity_score))
        # Re-order keywords (and their links) by descending similarity
        order = np.argsort(similarity_score)
        print(len(order))
        other_index = []
        other_index_link = []
        for i in range(len(order)):
            other_index.append(relevant_english[order[(len(order) - 1) - i]])
            other_index_link.append(relevant_english_links[order[(len(order) - 1) - i]])
        relevant_english = other_index
        relevant_english_links = other_index_link
    else:
        # Wikipedia2vec returned no entity vector for the title itself, so we
        # cannot compute similarity values between the title and the extracted
        # keywords; show the relevant_english list unchanged.
        count = -1

    print(len(relevant_english))
    end = time.process_time()
    print("Time taken to get similarity scores")
    print(end - start)
    print("API calls completed")
    print("Keys not found = ")
    print(count)

    ans = {}
    ans['url_en'] = "https://en.wikipedia.org/wiki/" + title_en
    ans["keywords"] = unique_non_en
    ans["English_keywords"] = relevant_english
    ans['links'] = relevant_english_links
    temp = jsonify(ans)
    temp.status_code = 200
    return temp
from time import strftime from jsonrpclib.jsonrpc import ProtocolError import logging import sys from pairseslib.pickling import pickleDump, pickleLoad if __name__ == '__main__': logging.basicConfig(filename=os.path.join(cfg['home'], 'wikiread.log'), level=logging.DEBUG, format=cfg['logtimestampformat']) # Open the Wikipedia dump through wikidump wikipediaDump = wikiModel.Dump( '/Volumes/Data/wikidump/enwiki-20130304-pages-articles.xml', False, False) # Instantiate the English Wikipedia worder propertyWorder = EnglishWikipediaModule() text = wikipediaDump.get_page_contents_by_title('Bern').decode('utf-8') text = expandTemplates(text, propertyWorder) # Preliminary wiki markup cleanup text = WikiExtractor.clean(text) # Final wiki markup cleanup (turning text into a list of section titles and paragraphs) text = WikiExtractor.compact(text) for line in text: print(line.encode('utf-8'))
def patternHarvester(title, propertyWorder, wikipediaDump):
    language = propertyWorder.getLanguage()
    sourceWiki = language
    print('Working on `%s`' % title)

    ############################################################################
    # Fetch triples for subject #
    print_n_flush('Querying DBPedia...')
    subjectIRI = expandIRI('dbpedia:' + title)
    subjectTriples = fetchSubjectTriples(subjectIRI, language, False, False)
    print 'OK'
    # End of "Fetch triples for subject" #
    ############################################################################

    # Obtain title for the article (i.e. primary subject name)
    primaryTitleLabels = getValuesForPredicate(subjectTriples, 'rdfs:label')
    # We expect a singleton here
    # (i.e. there is one triple for predicate rdfs:label)
    try:
        assert len(primaryTitleLabels) == 1
    except AssertionError:
        projectedTitle = title.replace('_', ' ')
        primaryTitleLabels = {unicode(projectedTitle)}
        message = "Could not find a primary label for %s, will try %s" % (
            title, projectedTitle)
        print(message)
    titleLabel = primaryTitleLabels.pop()

    ############################################################################
    # Retrieve article for subject #
    print_n_flush('Retrieving article from Wikipedia...')
    # We don't do this anymore
    # article = getCurrentWikiArticleText(sourceWiki, title)
    # We do this instead, fetching the article from the wikipedia dump
    strTitleLabel = unidecode(titleLabel)
    try:
        rawArticle = wikipediaDump.get_page_contents_by_title(strTitleLabel)
    except KeyError:
        message = "Could not fetch the article for " + titleLabel
        logging.warning(message)
        print(message)
        return
    article = rawArticle.decode('utf-8')
    print 'OK'
    # End of "Retrieve article for subject"
    ############################################################################

    subjectWordings = set()
    subjectWordings.add(titleLabel)
    # Retrieve secondary names (obtained from redirects to the primary article)
    # and add them as subject labels
    subjectWordings |= otherLabels(subjectIRI, language)

    # Filter and get the labels for the classes the subject is an instance of
    # (e.g. Los Angeles would have "city" as a label to an object for a
    # rdf:type triple)
    subjectClasses = getLabelsForPredicate(subjectTriples, 'rdf:type')
    wordedClassLabels = set()
    for classLabel in subjectClasses:
        captlzd, uncptlzd = propertyWorder.getClassLabelWording(classLabel)
        wordedClassLabels.add(uncptlzd)
        wordedClassLabels.add(captlzd)
    subjectWordings |= wordedClassLabels

    ### Compute and annotate wordings for the subject
    annotatedSubjectWordings = list()
    # Cycle through all wordings for the subject and get an annotation
    # for each one
    for subjectWording in subjectWordings:
        try:
            (root, words, graph) = annotateText(subjectWording)
        except AnnotationError:
            continue
        annotatedSubjectWordings.append((subjectWording, (root, words, graph)))

    ### Compute and annotate wordings for objects in each triple
    print_n_flush('Finding and annotating wordings for triple objects...')
    annotatedObjectWordings = list()
    predicateOccurrences = dict()
    for triple in subjectTriples:
        predicate = triple['p']['value']
        if predicate in ignored:
            continue
        if predicate not in predicateOccurrences:
            predicateOccurrences[predicate] = set()
        try:
            objectWording = getCommonWording(triple, propertyWorder)
        except CommonWordingNotFound:
            # TODO: Find out if any important data types are left out
            continue
        try:
            (root, words, graph) = annotateText(objectWording)
        except AnnotationError:
            continue
        annotatedObjectWordings.append(
            (objectWording, (root, words, graph, predicate)))
    print 'OK'
    #END# Wordings annotation

    ### Expand relevant templates in the Wikipedia article
    print_n_flush('Expanding relevant templates...')
    article = expandTemplates(article, propertyWorder)
    print 'OK'
    #END# Templates expansion

    ### Wiki markup cleaning
    print_n_flush('Getting rid of wiki markup...')
    # Preliminary cleanup
    article = WikiExtractor.clean(article)
    # Final cleanup (turning text into a list of section titles and paragraphs)
    article = WikiExtractor.compact(article)
    print 'OK'
    #END# Wiki markup cleaning

    # Sentence counter
    i = 0
    j = -1
    for paragraph in article:
        # Account for a bug in the PunktSentenceTokenizer when handling
        # sentence-ending marks followed by a double quote mark
        paragraph = paragraph.replace('?"', '? "')
        paragraph = paragraph.replace('!"', '! "')
        paragraph = paragraph.replace('."', '. "')
        #TODO: Language-agnostic sentence tokenizer
        sentences = tokenize_sentence(paragraph)
        for sentence in sentences:
            sentence = propertyWorder.adjustText(sentence)
            # Statistics
            for ow, (owRootWord, owWords, owGraph,
                     predicate) in annotatedObjectWordings:
                if ow in sentence:
                    predicateOccurrences[predicate].add(ow)
            i += 1
            # Get the graph for this sentence
            print_n_flush('PS')
            # Parse the sentence through the Stanford NLP Core Tools
            try:
                (sentenceR, sentenceW, sentenceG, sentence,
                 sentenceWData) = annotateText(sentence, True)
            except AnnotationError:
                continue
            legalNodeIndices = map(lambda x: int(x[x.rindex("-") + 1:]),
                                   sentenceG.nodes())
            rootNode = 'ROOT-0'
            # From here on, the initials "sw" refer to "subject wording"
            for sw, (swRootWord, swWords, swGraph) in annotatedSubjectWordings:
                try:
                    swRootWordIndex = matchWording(sentence, sentenceW,
                                                   sentenceG, legalNodeIndices,
                                                   sentenceWData, sw, swWords,
                                                   swGraph, swRootWord)
                except ValueError:
                    # No match found for wording in sentence
                    continue
                subjectTarget = swRootWord + '-' + unicode(swRootWordIndex)
                # Compute and generate subgraph for shortest path to Subject
                # s1 will be the nodes from root to subject
                try:
                    s1 = set(
                        shortestPathFromRoot(sentence, sentenceG,
                                             subjectTarget))
                except ShortestPathError:
                    continue
                # From here on, the initials "ow" refer to "object wording"
                # Compute and generate subgraph for shortest path to Object
                # s2 is the set of nodes from root to object
                for ow, (owRootWord, owWords, owGraph,
                         predicate) in annotatedObjectWordings:
                    try:
                        owRootWordIndex = matchWording(
                            sentence, sentenceW, sentenceG, legalNodeIndices,
                            sentenceWData, ow, owWords, owGraph, owRootWord)
                    except ValueError:
                        # No match found for wording in sentence
                        continue
                    objectTarget = owRootWord + '-' + unicode(owRootWordIndex)
                    if objectTarget == subjectTarget:
                        # No use for this kind of pattern
                        continue
                    try:
                        s2 = set(
                            shortestPathFromRoot(sentence, sentenceG,
                                                 objectTarget))
                    except ShortestPathError:
                        continue
                    # At this point, we definitely have a pattern.
                    # Nodes in the spanning tree comprising solely the shortest
                    # paths to the subject and to the object
                    s = s1 | s2
                    # S is the aforementioned spanning tree
                    S = nx.DiGraph(sentenceG.subgraph(s), name=predicate)
                    anonRoot = unicode(cfg['roottag'] + '-0')
                    anonSubject = unicode(cfg['subjecttag'] + '-' +
                                          unicode(swRootWordIndex))
                    anonObject = unicode(cfg['objecttag'] + '-' +
                                         unicode(owRootWordIndex))
                    renamings = dict()
                    renamings[rootNode] = anonRoot
                    renamings[subjectTarget] = anonSubject
                    renamings[objectTarget] = anonObject
                    entities = list()
                    numerals = 0
                    try:
                        for node in S.nodes():
                            if node not in renamings.keys():
                                if propertyWorder.partOfProperNoun(node):
                                    # The word may refer to an entity; in this
                                    # case abstract from the word and save a
                                    # relation for this pattern
                                    index = int(node[node.rindex('-') + 1:])
                                    anonEntity = '%s%05d-%d' % (
                                        cfg['entitytagprefix'], len(entities),
                                        index)
                                    renamings[node] = anonEntity
                                    entityWording = associatedWording(
                                        sentence, node, sentenceG,
                                        sentenceWData,
                                        allowNestedWordingMatch=True)
                                    entities.append(
                                        (entityWording,
                                         getClasses(entityWording, language)))
                                elif isNumeric(node):
                                    index = int(node[node.rindex('-') + 1:])
                                    anonNumeral = '%s%05d-%d' % (
                                        cfg['numerictagprefix'], numerals,
                                        index)
                                    numerals += 1
                                    renamings[node] = anonNumeral
                    except AnnotationError:
                        continue
                    # First anonymize subject, object and entities
                    S = nx.relabel_nodes(S, renamings)
                    # Remove indices as well
                    indexlessNodes = map(lambda word: word[0:word.rindex("-")],
                                         S.nodes())
                    S = nx.relabel_nodes(S,
                                         dict(zip(S.nodes(), indexlessNodes)))
                    if '' in S.nodes():
                        # A bug in either the SCNLP or the python wrapper
                        # makes empty nodes out of schwas and other unicode
                        # chars that might be used as a diacritic
                        # TODO: Find a fix for this
                        message = ('Invalid dependencies for this sentence: ' +
                                   sentence)
                        logging.warning(message)
                        print(message)
                        continue
                    # DOT representation of the graph
                    pydotS = nx.to_pydot(S).to_string().encode(
                        encoding='UTF-8', errors='strict')
                    pattern = Pattern(pydotS, predicate, entities, title, sw,
                                      ow, sentence)
                    try:
                        saveGraph(S, pattern.hash)
                    except (TypeError, UnicodeEncodeError):
                        # TODO: Fix this "TypeError: coercing to Unicode: need
                        # string or buffer, NoneType found" error, and also:
                        # "UnicodeEncodeError: 'ascii' codec can't encode
                        # character"
                        checkLog = True
                        logging.warning('A graph could not be saved: '
                                        'Sentence: ' + sentence +
                                        'Nodes: ' + str(S.nodes()) +
                                        'Edges: ' + str(S.edges(data=True)))
    storePredicateOccurrences(title, predicateOccurrences)
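# A sketch of how patternHarvester might be driven, reusing the
# wikiModel.Dump and EnglishWikipediaModule helpers from the __main__ snippet
# earlier; the dump path and titles below are placeholders.
wikipediaDump = wikiModel.Dump('/path/to/enwiki-pages-articles.xml', False, False)
propertyWorder = EnglishWikipediaModule()
for title in ['Bern', 'Zurich']:
    patternHarvester(title, propertyWorder, wikipediaDump)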