def naivepatternHarvester(title, propertyWorder, wikipediaDump,
                          naivePredicateStatistics, naiveSubjectStatistics):
    language = propertyWorder.getLanguage()

    print('Working on `' + title + '`')

    # Get triples from DBPedia
    print_n_flush('Querying DBPedia...')

    iri = '<' + namespaces['dbpedia'] + title + '>'
    dbpediaData = fetchSubjectTriples(iri, language)

    # End of DBPedia get triples
    print 'OK'

    sourceWiki = language

    # Retrieve Wikipedia article
    print_n_flush('Retrieving article from Wikipedia...')

    # Obtain a pattern graph for the subject
    titleLabelSingleton = getPredicateValues(dbpediaData, 'rdfs:label')

    # We are pretty sure at this point that titleLabelSingleton is a singleton (i.e. there is only one triple for the predicate rdfs:label)
    try:
        assert (len(titleLabelSingleton) == 1)
    except:
        return
    titleLabel = iter(titleLabelSingleton).next()

    # We don't do this anymore
    # text = getCurrentWikiArticleText(sourceWiki, title)
    # We do this instead
    try:
        text = wikipediaDump.get_page_contents_by_title(
            unidecode(titleLabel)).decode('utf-8')
    except KeyError:
        print_n_flush('\nCould not find a page with this title: "' +
                      unidecode(titleLabel) + '", skipping')
        return

    # End of Wikipedia article retrieving
    print 'OK'

    # Remove wiki markup
    print_n_flush('Getting rid of wiki markup...')

    # Preliminary wiki markup cleanup
    text = WikiExtractor.clean(text)
    # Final wiki markup cleanup (turning text into a list of section titles and paragraphs)
    text = WikiExtractor.compact(text)

    # End of wiki markup cleaning
    print 'OK'

    mergedText = u' '.join(text)
    naivepatterns.naiveStatistics(title, mergedText, dbpediaData,
                                  propertyWorder, naivePredicateStatistics,
                                  naiveSubjectStatistics, 3, False)
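
The clean/compact calls above assume the legacy single-file WikiExtractor.py used throughout these examples, where both are module-level functions and compact() returns a list of section titles and paragraphs (newer releases restructured this API). A minimal sketch of that pipeline, with a hypothetical wikitext string standing in for a dump page:

import WikiExtractor

raw_wikitext = u"'''Bern''' is the [[de facto]] capital of [[Switzerland]]."  # hypothetical input
text = WikiExtractor.clean(raw_wikitext)       # strip templates, links and other markup
paragraphs = WikiExtractor.compact(text)       # list of section titles and paragraphs
merged = u' '.join(paragraphs)                 # flat string, as passed to naiveStatistics above
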
Example #2
 def get_kb_description(self, topic_title):
     raw_content = wikipedia_api_util.get_raw_page_text(topic_title)
     cleaned = WikiExtractor.clean(raw_content)
     compacted = WikiExtractor.compact(cleaned)
     desc = ' '.join(compacted)
     if desc is None or desc.strip()=='':
         return topic_title
     return desc
Example #3
 def get_kb_description(self, topic_title):
     raw_content = wikipedia_api_util.get_raw_page_text(topic_title)
     cleaned = WikiExtractor.clean(raw_content)
     compacted = WikiExtractor.compact(cleaned)
     desc = ' '.join(compacted)
     if desc is None or desc.strip() == '':
         return topic_title
     return desc
Example #4
def run(path_to_dump, wiki_files_dir='wiki', path_to_res='res_wiki.csv', workers_num=3):
    # step #1 - process files via wikiextractor
    argv = shlex.split(f'-o {wiki_files_dir} --json --processes {workers_num} {path_to_dump}')
    sys.argv = [sys.argv[0]] + argv
    print(argv)
    WikiExtractor.main()

    # step #2 - postprocessing
    postprocessing.run(wiki_files_dir, path_to_res, workers_num)
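
Driving WikiExtractor this way just amounts to faking its command line before calling main(). A hypothetical invocation of the function above (the dump path and output locations are placeholders):

run('enwiki-latest-pages-articles.xml.bz2',
    wiki_files_dir='wiki',
    path_to_res='res_wiki.csv',
    workers_num=3)
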
Example #5
    def clean_markups(self, text):
        if not text:
            return ""

        clean_text = WikiExtractor.clean(text)
        clean_frags = WikiExtractor.compact(clean_text)
        clean_html = [re.sub(HTML_TAG_REGEX, '', frag) for frag in clean_frags]

        return "\n".join(clean_html) if len(clean_html) > 0 else ""
Example #6
 def remove_markup(self):
     # First fix wiktionary links that aren't being handled properly
     # by the WikiExtractor library.
     wikt = r"\[{2,}wikt:[^\|]+\|([^\]]+)\]{2,}"
     self.text = re.sub(wikt, r'\1', self.text)
     broken_wikt = r"{{broken wikt link\|([^\|}]+)(?:\|([^}]+))?}{2,}"
     self.text = re.sub(broken_wikt, r'\1', self.text)
     # Use the WikiExtractor library to finish processing
     self.text = WikiExtractor.clean(self.text)
     self.text = '\n'.join(WikiExtractor.compact(self.text))
Example #7
    def clean_markups(self, text):
        if not text:
            return ""

        clean_text = WikiExtractor.clean(text)
        clean_frags = WikiExtractor.compact(clean_text)
        clean_html = [re.sub(HTML_TAG_REGEX, '', frag)
                      for frag in clean_frags]

        return "\n".join(clean_html) if len(clean_html) > 0 else ""
Example #8
 def remove_markup(self):
     """Remove wiki markup leaving just the plain-text."""
     # First fix wiktionary links that aren't being handled properly
     # by the WikiExtractor library.
     wikt = r"\[{2,}wikt:[^\|]+\|([^\]]+)\]{2,}"
     self.text = re.sub(wikt, r'\1', self.text)
     broken_wikt = r"{{broken wikt link\|([^\|}]+)(?:\|([^}]+))?}{2,}"
     self.text = re.sub(broken_wikt, r'\1', self.text)
     # Use the WikiExtractor library to finish processing
     self.text = WikiExtractor.clean(self.text)
     self.text = '\n'.join(WikiExtractor.compact(self.text))
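
The two regular expressions above only rewrite the wiktionary-style links that the library mishandles; everything else is left to WikiExtractor.clean(). A quick illustration on a made-up snippet:

import re

wikt = r"\[{2,}wikt:[^\|]+\|([^\]]+)\]{2,}"
broken_wikt = r"{{broken wikt link\|([^\|}]+)(?:\|([^}]+))?}{2,}"

sample = "a [[wikt:colour|colour]] and a {{broken wikt link|hue}} example"   # hypothetical text
sample = re.sub(wikt, r'\1', sample)
sample = re.sub(broken_wikt, r'\1', sample)
# -> "a colour and a hue example"
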
Example #9
    def page_handler(page):
        global db_cursor
        global db

        try:

            if 'redirect' in page:
                synonym_data = {
                    'synonym': page['title'] + ';',
                    'redirect': page['redirect']
                }

                db_cursor.execute(
                    """
                            UPDATE articles  
                            SET synonyms = 
                                IFNULL(CONCAT(synonyms, %(synonym)s), %(synonym)s)
                            WHERE title = %(redirect)s
                            """, synonym_data)
                #print('Number of rows inserted: %d' % db_cursor.rowcount)
                db.commit()
                return
            """Write the right bits to the right files."""
            #print(page['title'])
            #print(page['title'])
            #print("page_id :",page['id'])
            #print(page['redirect'])
            #print("time :",page['revisions'][-1]['timestamp'])
            text = HTMLParser.HTMLParser().unescape(
                page['revisions'][-1]['text'])
            text = ''.join(BeautifulSoup(text).findAll(text=True))
            text = WikiExtractor.clean(text)
            text = ''.join(WikiExtractor.compact(text))
            #print(text)

            article_data = {
                'id': page['id'],
                'title': page['title'],
                'timestamp': page['revisions'][-1]['timestamp'],
                'text': text
            }
            print(page['id'])
            db_cursor.execute(
                """
                        INSERT INTO articles(id, title, timestamp, text) 
                            VALUES (%(id)s, %(title)s, %(timestamp)s, %(text)s)
                        """, article_data)

            #print('Number of rows inserted: %d' % db_cursor.rowcount)
            db.commit()
        except Exception, e:
            print >> sys.stderr, "invoked error. id : %s, %s" % (page['id'], e)
Example #10
    def page_handler(page):
        global db_cursor
        global db


        try:

                if 'redirect' in page:
                    synonym_data = {
                        'synonym': page['title'] + ';',
                        'redirect': page['redirect']
                    }

                    db_cursor.execute("""
                            UPDATE articles  
                            SET synonyms = 
                                IFNULL(CONCAT(synonyms, %(synonym)s), %(synonym)s)
                            WHERE title = %(redirect)s
                            """, synonym_data)
                    #print('Number of rows inserted: %d' % db_cursor.rowcount)
                    db.commit()
                    return

                """Write the right bits to the right files."""
                #print(page['title'])
                #print(page['title'])
                #print("page_id :",page['id'])
                #print(page['redirect'])
                #print("time :",page['revisions'][-1]['timestamp'])
                text = HTMLParser.HTMLParser().unescape(page['revisions'][-1]['text'])
                text = ''.join(BeautifulSoup(text).findAll(text=True))
                text = WikiExtractor.clean(text)
                text = ''.join(WikiExtractor.compact(text))
                #print(text)

                article_data = {
                    'id': page['id'],
                    'title': page['title'],
                    'timestamp': page['revisions'][-1]['timestamp'],
                    'text': text
                }
                print(page['id'])
                db_cursor.execute("""
                        INSERT INTO articles(id, title, timestamp, text) 
                            VALUES (%(id)s, %(title)s, %(timestamp)s, %(text)s)
                        """, article_data)

                #print('Number of rows inserted: %d' % db_cursor.rowcount)
                db.commit()
        except Exception, e:
            print >> sys.stderr, "invoked error. id : %s, %s" % (page['id'], e)
Example #11
def format_wikicorpus(input, output, bytes, num_process, num_out_files):
    if input is None:
        raise ValueError('input file is empty.')
    if not input.endswith('xml.bz2'):
        raise ValueError('input file not *.xml.bz2.')
    if not os.path.exists(output):
        os.makedirs(output)

    # Use WikiExtractor to extract the content
    WikiExtractor = try_import_wikiextractor()
    wiki_path = os.path.join(output, 'extracted')
    sys.argv = ['prog', '-b', bytes, '-o', wiki_path, input]
    WikiExtractor.main()

    # Merge extracted content into txt files
    prepared_path = os.path.join(output, 'prepared_wikipedia')
    if not os.path.exists(prepared_path):
        os.makedirs(prepared_path)
    filenames = get_formatting_list(wiki_path, recursive=True)
    num_files = len(filenames)
    num_out_files = min(num_out_files, num_files)
    file_volume = math.ceil(num_files / num_out_files)
    splited_files = [
        filenames[i:i + file_volume] for i in range(0, num_files, file_volume)
    ]
    num_out_files = len(splited_files)
    output_files = [
        os.path.join(prepared_path,
                     "wikipedia-prepared-{}.txt".format(str(i).zfill(4)))
        for i in range(num_out_files)
    ]
    print("All prepared raw text will be saved in {} txt files".format(
        num_out_files))
    num_process = min(num_process, num_out_files)
    print('Start preprocessing {} text files with {} cores'.format(
        num_files, num_process))
    process_args = [(splited_files[i], output_files[i])
                    for i in range(num_out_files)]

    start_time = time.time()
    with multiprocessing.Pool(num_process) as pool:
        f_read = 0
        for i, _ in enumerate(pool.imap(merge, process_args)):
            elapsed = time.time() - start_time
            f_read += len(splited_files[i])
            print(
                "prepared {:} files, Elapsed: {:.2f}s, ETA: {:.2f}s, ".format(
                    f_read, elapsed,
                    (num_files - f_read) / (f_read / elapsed)))
    print("Done preparation within {:.2f} seconds".format(elapsed))
Example #12
def cleaner_Both(wikidoc):
    '''Use WikiExtractor for cleaning
       Use Parser from hell for links
    '''
    wikidoc.clean_text = WikiExtractor.clean(wikidoc.wiki_text)
    wp = WikiTextProcessor(wikidoc.wiki_text)
    wikidoc.meta[WdNames.LINKS] = wp.get_links()
    return wikidoc
Example #13
def cleaner_Both(wikidoc):
    '''Use WikiExtractor for cleaning
       Use Parser from hell for links
    '''
    wikidoc.clean_text = WikiExtractor.clean(wikidoc.wiki_text)
    wp = WikiTextProcessor(wikidoc.wiki_text)
    wikidoc.meta[WdNames.LINKS] = wp.get_links()
    return wikidoc
Example #14
def go(name, date, cache=False):

    query_fmt = \
        'http://en.wikipedia.org/w/api.php?' \
        'action=query'    \
        '&format=json'    \
        '&prop=revisions' \
        '&list='          \
        '&pageids={}'     \
        '&rvsection=0'    \
        '&rvprop=timestamp%7Ccontent' \
        '&rvstart={:04d}-{:02d}-{:02d}T00%3A00%3A00.000Z'

    # format the query
    timestamp = "{}-{}-{}".format(date.year, date.month, date.day)
    query = query_fmt.format(page_ids[name], date.year, date.month, date.day)
    sentiment_result = SentimentResult(name, page_ids[name], timestamp)

    # check to see if we actually need to perform the lookup
    if cache and sentiment_result.is_cached:
        sentiment_result.sync()
        return sentiment_result
    
    # if we don't have it in the cache, perform the query
    data  = json.loads(requests.get(query).text)

    # parse the result with BeautifulSoup
    wiki_markup  = data['query']['pages'][str(page_ids[name])]['revisions'][0]['*']
    
    def format(text):
        lines = text.split('\n')
        return ' '.join([i for i in lines if i][1:-1])

    # extract readable text from the markup
    extractor = WikiExtractor.Extractor(page_ids[name], 0, name, wiki_markup)
    sio = io.StringIO()
    extractor.extract(sio)
    sio.seek(0)
    text = format(sio.read())
    
    # score the result with Google's sentiment analysis
    score, magnitude = analyze(text)
    sentiment_result.score = score
    sentiment_result.magnitude = magnitude
    sentiment_result.length = len(text)
    
    # cache to a file, if necessary
    if cache: sentiment_result.cache()

    return sentiment_result
Example #15
def extractCleanText(page, anchorID, english27, Title_ID_All, Redirect_Hashmap,
                     In_Link_Graph_Degree, min_degree):
    """
    extract the clean text from Wikipedia page (tag <text> of the dump)
    @param page: Wikipedia page (tag <text> of the dump)
    @param anchorID: if true, internal links are replaced with the anchor ID; otherwise their surface form is used
    @return: clean text of Wikipedia page
    """
    page = WikiExtractor.cleanText(page, anchorID, Title_ID_All,
                                   Redirect_Hashmap, In_Link_Graph_Degree,
                                   min_degree, english27)
    if english27:
        page = clean27English(
            page)  # Convert to 27 English ASCII char plus space
    return page
Example #16
def renderRevision(rev, title):
    """Renders revision dictionary in HTML/WikiMarkup into plaintext. TODO Html conversion!"""

    if (rev["*"] != None):
        if (rev["format"] == "wikimarkup"):
            text = rev["*"]
            out = io.StringIO()
            extractor = WikiExtractor.Extractor(0, 0, title, text.split("\n"))
            extractor.extract(out)
            rev["*"] = out.getvalue()
            out.close()
            rev = splitBySentences(rev)
            rev["format"] = "plaintext"
            return rev
        else:
            return rev
    else:
        return rev
Example #17
def get_title(squery):
#Gets the wiki page for the title squery.
    url = "http://en.wikipedia.org/w/api.php?action=query&prop=revisions&rvprop=content&format=json&"+str(urllib.urlencode({"titles":squery}))
    while True:
        try:
            dat = urllib.urlopen(url).read()
            dat = json.loads(dat)
            break
        except ValueError:      #Proxy might reject request.
            print "Retrying"
            pass
    kys = dat['query']['pages'].keys()[0]
    dat = dat['query']['pages'][kys]['revisions'][-1]['*']
    redir = re.findall("#REDIRECT \[\[(.*?)\]\]",dat,re.IGNORECASE) #Handle Redirection.
    if(len(redir)!=0):
        return get_title(redir[0])
    print "Retrieved " + str(squery)
    dat2 = we.clean(dat)
    return dat2             #TWEET_LENGTH word Tweet.
Example #18
def cleanArticle(inCollection, outCollection):
    lastId = -1
    if outCollection.count() != 0:
        lastId = outCollection.find().sort([("_id", pymongo.DESCENDING)]).limit(1)[0]["_id"]
        print "Starting from id greater than: {}".format(lastId)
        sys.stdout.flush()
    numCleaned = 0
    for article in inCollection.find({"_id": {"$gt": lastId}}).sort([("_id", pymongo.ASCENDING)]):
        # Parse it.
        extractor = WikiExtractor.Extractor(article["_id"], article["title"], [article["text"]])
        article["text"] = extractor.clean()
        outCollection.insert_one(article)

        # Print progress.
        numCleaned += 1
        if numCleaned % 1000 == 0:
            print "Cleaned {} articles so far...".format(numCleaned)
            sys.stdout.flush()

    return numCleaned
Example #19
def get_wiki_page_clean(article_title):
    xml_str = get_article_xmlpage(article_title)
    clean_str = WikiExtractor.run(xml_str, keep_sections=False, keep_links=True)
    return clean_str
Example #20
def get_wiki_page_clean(article_title):
    xml_str = get_article_xmlpage(article_title)
    clean_str = WikiExtractor.run(xml_str,
                                  keep_sections=False,
                                  keep_links=True)
    return clean_str
Example #21
    infobox = utility.extractInfobox(pageText)
    if infobox != "":
        infoboxList[page_ID] = infobox

    # extract heads
    headings = utility.extractHeads(pageText)
    for head in headings:
        head = head.replace('=', '').strip()
        headDictionary[page_ID].append(head)

    # extract category
    categories = utility.extractCategory(pageText)
    for cat in categories:
        categoryDictionary[page_ID].append(cat)

    anchors, anchorSurfaces = WikiExtractor.getAnchor(pageText)

    #create surface dictionary
    for anchorSurface in anchorSurfaces:
        anchorSurface = anchorSurface.replace('\n', '').replace('\r', '')
        temp = anchorSurface.split("@@")
        surface = temp[1]
        anchor = temp[0]

        #change encoding
        anchor = anchor.encode("utf-8")
        #capitalize the first letter
        anchor = anchor[0:1].capitalize() + anchor[1:]
        # remove # sign
        if '#' in anchor:
            anchor = anchor[0:anchor.index('#')]
Example #22
    def words(self, normalise=False, strict_words=True, lowercase=False):

        #? ! . ?" !" ." ?'' !'' .''
        sentence_end_re = re.compile(
            u"(?:\.|\?|!|\.''|\?''|!''|\?\"|!\"|\.\")$", re.U)

        class outter(object):
            def __init__(self):
                self.ls = []

            def write(self, l):
                self.ls.append(l)

            def text(self):
                return u"".join(self.ls[1:-1])

        pages = 0
        for i, (id, title, page) in enumerate(self.pages()):
            pages += 1
            out = outter()
            WikiExtractor.Extractor(id, title, page).extract(out)
            lastw = None
            for w in out.text().split():
                wnorm = w

                # special case: "==Zdroje" ("Sources") section headings
                if lastw is None or sentence_end_re.search(lastw):
                    sentence_start = True
                else:
                    sentence_start = False
                if not sentence_start:
                    if w.startswith("==") or lastw.endswith("=="):
                        sentence_start = True

                if normalise:
                    wnorm = self.normalise(w, True, False)

                if strict_words:
                    if wnorm.isupper() or wnorm.isnumeric():
                        wnorm = ""
                    else:
                        wnorm1 = self.normalise(wnorm, False, True)
                        if len(wnorm1) != len(wnorm):
                            wnorm = ""
                    if lowercase and 0 < len(wnorm):
                        wnorm = wnorm.lower()
                # TODO debug
                # if wnorm in(
                #         u"Má",
                # ):
                #     if sentence_start:
                #         pass
                #     else:
                #         pass
                if 0 == len(wnorm):
                    lastw = w
                    continue
                if not sentence_start and w[0].isupper():
                    pass
                if sentence_start and not w[0].isupper():
                    pass
                yield w, wnorm, sentence_start, pages
                lastw = w
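
The sentence_start bookkeeping above hinges on the regular expression that tests whether the previous token ended a sentence; in isolation it behaves like this (the sample tokens are hypothetical):

import re

sentence_end_re = re.compile(u"(?:\.|\?|!|\.''|\?''|!''|\?\"|!\"|\.\")$", re.U)

bool(sentence_end_re.search(u'markup.'))    # True  -> the next word starts a sentence
bool(sentence_end_re.search(u'markup?"'))   # True  -> quote-terminated question
bool(sentence_end_re.search(u'markup,'))    # False -> same sentence continues
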
Example #23
            text = wiki._normalise_re_apos1.sub(ur'\1"', text)
            text = wiki._normalise_re_apos2.sub(ur'"\1', text)
            text = wiki._normalise_re_apos3.sub(ur'"', text)
            text = wiki._normalise_re_non_letter_start.sub(ur'', text)
            text = wiki._normalise_re_non_letter_end.sub(ur'', text)
        if inner:
            text = wiki._normalise_re_non_letter.sub(ur'', text)

        return text


if __name__ == '__main__':
    w = wiki("../skwiki-20151226-pages-articles.xml")

    class outter(object):
        def __init__(self):
            self.ls = []

        def write(self, l):
            self.ls.append(l)

        def text(self):
            return "".join(self.ls[1:-1])

    for i, (id, title, page) in enumerate(w.pages()):
        out = outter()
        WikiExtractor.Extractor(id, title, page).extract(out)
        print out.text()
        if i > 5000:
            break
Example #24
def cleaner_WikiExtractor(wikidoc):
    wikidoc.clean_text = WikiExtractor.clean(wikidoc.wiki_text)
    return wikidoc
Example #25
def sampler(title, propertyWorder, wikipediaDump, sampleSentences):	

	language = propertyWorder.getLanguage()

	print('Working on `%s`' % title)

	projectedTitle = unquote(title.replace('_',' ')).decode(encoding='utf-8')
	primaryTitleLabels = {projectedTitle}
	
	print('Going with "%s"' % (projectedTitle))
	
	titleLabel = primaryTitleLabels.pop()

	############################################################################ 
	# 						Retrieve article for subject					   #
	print_n_flush('Retrieving article from Wikipedia...')

	# We do this instead, fetching the article from the wikipedia dump
	strTitleLabel = unidecode(titleLabel)
	
	try:
		rawArticle = wikipediaDump.get_page_contents_by_title(strTitleLabel)
	except KeyError:
		message = "Could not fetch the article for " + titleLabel
		logging.warning(message)
		print(message)
		return
	
	article = rawArticle.decode('utf-8')
	
	print 'OK'

	### Expand relevant templates in the Wikipedia article
	print_n_flush('Expanding relevant templates...')
	article = removeSectionTitles(article)
	article = expandTemplates(article, propertyWorder)

	print 'OK'	
	#END# Templates expansion

	### Wiki markup cleaning
	print_n_flush('Getting rid of wiki markup...')
	
	# Preliminary cleanup
	article = WikiExtractor.clean(article)
	# Final cleanup (turning text into a list of section titles and paragraphs)
	article = WikiExtractor.compact(article)

	print 'OK'
	#END# Wiki markup cleaning
	
	for paragraph in article:

		""" Account for a bug in the PunktSentenceTokenizer when handling
		 	sentence-ending marks followed by a double quote mark """
		paragraph = paragraph.replace('?"', '? "')
		paragraph = paragraph.replace('!"', '! "')
		paragraph = paragraph.replace('."', '. "')
		
		#TODO: Language-agnostic sentence tokenizer
		sentences = tokenize_sentence(paragraph)
		
		for sentence in sentences:
			sentence = propertyWorder.adjustText(sentence)
			sampleSentences.append(sentence)
Example #26
def get_keywords(title, primary):

    url_non_en = "https://" + primary + ".wikipedia.org/wiki/Special:Export/" + title
    resp = requests.get(url_non_en)
    with open('non_en.xml', 'wb') as f:
        f.write(resp.content)

    page = wptools.page(title, lang=primary)
    # page = wptools.page(title,lang = "hi")
    page.get_parse()
    wikidata_id = page.data['wikibase']
    # print("Wikidata Id obtained :" + wikidata_id)
    page_en = wptools.page(wikibase=wikidata_id)
    page_en.get_wikidata()
    title_en = page_en.data['title']
    url_en = "https://en.wikipedia.org/wiki/Special:Export/" + title_en
    resp = requests.get(url_en)
    with open('eng.xml', 'wb') as f:
        f.write(resp.content)

    # Execute the WikiExtractor.py code to process the non-en and en XMLs
    WikiExtractor.main()
    print("Code Executed")
    non_en_text = ""
    en_text = ""
    with open("non_en/AA/wiki_00", 'r') as f:
        non_en_text = f.read()
        f.close()
    with open("en/AA/wiki_00", 'r') as f:
        en_text = f.read()
        f.close()
    # page_en = wptools.page(title_en)
    # page_en.get_parse()

    url = "https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles=" + title_en
    url += "&format=json"
    resp = requests.get(url)
    data = resp.content
    data = data.decode('utf8')
    data = json.loads(data)
    pages = data["query"]["pages"]
    for items in pages:
        # print(items)
        try:
            ID = pages[items]["pageprops"]["wikibase_item"]
            title_en = pages[items]["title"]
        except KeyError:
            continue
    print("Title English")
    print(title_en)
    # --------------------------------- Obtain section headings from the English XML page ------------------------

    section_headings = [i.start() for i in re.finditer("<sec>", en_text)]
    section_headings_end = [i.start() for i in re.finditer("</sec>", en_text)]
    print(len(section_headings))
    headings = []
    headings_pos = []
    for j in range(len(section_headings)):
        occurence = section_headings[j]
        title = en_text[occurence + 5:section_headings_end[j] - 1]
        headings.append(title)
        headings_pos.append(section_headings_end[j] + 6)

    # --------------------------------- Part Considering anchor text as keywords----------------------------------
    start = time.process_time()
    keywords_en = []
    keywords = []
    occur_en = [i.start() for i in re.finditer("href", en_text)]
    end_occur_en = [i.start() for i in re.finditer("</a>", en_text)]
    occur = [i.start() for i in re.finditer("href", non_en_text)]
    end_occur = [i.start() for i in re.finditer("</a>", non_en_text)]

    pos_keywords_en = {}
    for j in range(len(occur_en)):
        occurence = occur_en[j]
        title = ""
        pos = occurence + 6
        while (en_text[pos] != "\""):
            title += en_text[pos]
            pos += 1
        pos += 2
        # print(title)
        url = title
        # print(en_text[pos:end_occur_en[j]])
        title = en_text[pos:end_occur_en[j]]
        # title = title.lower()
        if (title_en.find(title) != -1):
            continue
        elif ((len(url) / len(title)) > 3):
            continue
        if title in pos_keywords_en:
            repitition = True
        else:
            pos_keywords_en[title] = occurence + 6
        keywords_en.append(title)

    for j in range(len(occur)):
        occurence = occur[j]
        title = ""
        pos = occurence + 6
        while (non_en_text[pos] != "\""):
            title += non_en_text[pos]
            pos += 1
        pos += 2
        url = title
        title = non_en_text[pos:end_occur[j]]
        if (title == 'के'):
            continue
        keywords.append(title)

    dict_keys = {}
    dict_keys_en = {}
    mappings_eng = {}
    mappings_non_en = {}
    for i in range(0, len(keywords_en), 50):
        # print(i)
        url = "https://en.wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles="
        count = 0
        j = i
        while (j < len(keywords_en) and count < 50):
            url += keywords_en[j]
            # url += urls_en[j]
            if (count != 49):
                url += "|"
            count += 1
            j += 1
        url += "&format=json"
        resp = requests.get(url)
        data = resp.content
        # print(data.json())
        data = data.decode('utf8')
        data = json.loads(data)
        pages = data["query"]["pages"]
        for items in pages:
            # print(items)
            try:
                ID = pages[items]["pageprops"]["wikibase_item"]
                title = pages[items]["title"]
                dict_keys_en[ID] = title
                mappings_eng[title] = title
            except KeyError:
                # print("error")
                continue
        normalizations = {}
        try:

            normalized = data["query"]["normalized"]
            for items in normalized:
                try:
                    normalizations[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
        try:

            redirects = data["query"]["redirects"]
            for items in redirects:
                try:
                    if items["from"] in normalizations:
                        mappings_eng[items["to"]] = normalizations[
                            items["from"]]
                    else:
                        mappings_eng[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue

    for i in range(0, len(keywords), 50):
        # print(i)
        url = "https://" + primary + ".wikipedia.org/w/api.php?action=query&prop=pageprops&ppprop=wikibase_item&redirects=1&titles="
        count = 0
        j = i
        while (j < len(keywords) and count < 50):
            url += keywords[j]
            # url += urls[j]
            if (count != 49):
                url += "|"
            count += 1
            j += 1
        url += "&format=json"
        resp = requests.get(url)
        data = resp.content
        # print(data.json())
        data = data.decode('utf8')
        data = json.loads(data)
        pages = data["query"]["pages"]
        for items in pages:
            # print(items)
            try:
                ID = pages[items]["pageprops"]["wikibase_item"]
                title = pages[items]["title"]
                dict_keys[ID] = title
                mappings_non_en[title] = title
            except KeyError:
                # print("error")
                continue
        normalizations = {}
        try:

            normalized = data["query"]["normalized"]
            for items in normalized:
                try:
                    normalizations[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
        try:

            redirects = data["query"]["redirects"]
            for items in redirects:
                try:
                    if items["from"] in normalizations:
                        mappings_non_en[items["to"]] = normalizations[
                            items["from"]]
                    else:
                        mappings_non_en[items["to"]] = items["from"]
                except KeyError:
                    continue
        except KeyError:
            continue
    print(len(dict_keys_en))
    print(len(dict_keys))
    # translator = Translator()

    relevant_english = []
    for keys in dict_keys_en:
        try:
            temp = dict_keys[keys]
        except:
            # relevant_english.append(dict_keys_en[keys])
            relevant_english.append(mappings_eng[dict_keys_en[keys]])
    unique_non_en = []
    s = 0
    for keys in dict_keys:
        try:
            temp = dict_keys_en[keys]
        except:
            # unique_non_en.append(dict_keys[keys])
            unique_non_en.append(mappings_non_en[dict_keys[keys]])
    relevant_english_links = []
    base_link = "https://en.wikipedia.org/wiki/" + title_en.replace(" ", "_")
    # print(relevant_english)
    # print(pos_keywords_en)
    for key_title in relevant_english:
        # key_title = key_title.lower()
        section_level = len(headings_pos) - 1
        if (key_title not in pos_keywords_en):
            relevant_english_links.append(base_link)
            continue
        while (section_level >= 0):
            if (pos_keywords_en[key_title] > headings_pos[section_level]):
                break
            section_level = section_level - 1

        if (pos_keywords_en[key_title] < headings_pos[0]):
            relevant_english_links.append(base_link)
        else:
            link_to_section = base_link + "#" + headings[
                section_level].replace(" ", "_")
            relevant_english_links.append(link_to_section)
        # chl position here
    end = time.process_time()
    print("Time taken to obtain mapping between keywords and page ids")
    print(end - start)

    print(
        "--------------------------------------------------------------------------------"
    )
    print("Wikipedia2vec execution begins")
    start = time.process_time()

    similarity_score = []
    entity_found = False
    try:
        title_vec = wiki2vec.get_entity_vector(title_en)
        entity_found = True
    except:
        entity_found = False
    count = 0
    if (entity_found):
        out = open("scores.txt", "w")
        for i in range(len(relevant_english)):
            score = 0
            try:
                key_vec = wiki2vec.get_entity_vector(relevant_english[i])
                dot = np.dot(title_vec, key_vec)
                norma = np.linalg.norm(title_vec)
                normb = np.linalg.norm(key_vec)
                cos = dot / (norma * normb)
                score = cos
            except:
                key_found = False
                count += 1
                score = 0
            similarity_score.append(score)
            out.write(
                str(relevant_english[i]) + "--> " + str(similarity_score[i]) +
                "\n")
        print("Len relevant english and similariy score")
        print(len(relevant_english))
        print(len(similarity_score))
        order = np.argsort(similarity_score)
        print(len(order))
        other_index = []
        other_index_link = []
        for i in range(len(order)):
            other_index.append(relevant_english[order[(len(order) - 1) - i]])
            other_index_link.append(
                relevant_english_links[order[(len(order) - 1) - i]])
        relevant_english = other_index
        relevant_english_links = other_index_link
    else:
        do_something = 0
        count = -1
        # Show the relevant_english list unchanged: Wikipedia2vec returned no entity
        # vector for the title, so similarity between the title and the extracted
        # keywords cannot be computed.
    print(len(relevant_english))
    # print(final)
    end = time.process_time()
    print("Time taken to get similarity scores")
    print(end - start)
    print("API calls completed")
    print("Keys not found = ")
    print(count)
    # out.close()

    # --------------------------------- Part Considering anchor text as keywords Ends-----------------------------

    ans = {}
    URL_en = "https://en.wikipedia.org/wiki/" + title_en
    ans['url_en'] = URL_en
    ans["keywords"] = unique_non_en
    ans["English_keywords"] = relevant_english
    ans['links'] = relevant_english_links
    temp = jsonify(ans)
    temp.status_code = 200

    return temp
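
Much of the length above comes from two near-identical batched MediaWiki API loops (English and the primary language). Stripped to its core, that title-to-Wikidata-ID lookup is roughly the following sketch; the function name and argument handling are hypothetical, and letting requests encode the parameters avoids the manual URL concatenation used above:

import requests

def wikibase_ids(titles, lang='en', batch=50):
    """Map page titles to Wikidata item IDs, 50 titles per API request."""
    ids = {}
    endpoint = 'https://%s.wikipedia.org/w/api.php' % lang
    for i in range(0, len(titles), batch):
        params = {'action': 'query', 'prop': 'pageprops', 'ppprop': 'wikibase_item',
                  'redirects': 1, 'format': 'json',
                  'titles': '|'.join(titles[i:i + batch])}
        pages = requests.get(endpoint, params=params).json().get('query', {}).get('pages', {})
        for page in pages.values():
            try:
                ids[page['title']] = page['pageprops']['wikibase_item']
            except KeyError:
                continue  # missing page or page without a Wikidata item
    return ids
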
Example #27
def cleaner_WikiExtractor(wikidoc):
    wikidoc.clean_text = WikiExtractor.clean(wikidoc.wiki_text)
    return wikidoc
Example #28
from time import strftime
from jsonrpclib.jsonrpc import ProtocolError
import logging
import sys
from pairseslib.pickling import pickleDump, pickleLoad

if __name__ == '__main__':

    logging.basicConfig(filename=os.path.join(cfg['home'], 'wikiread.log'),
                        level=logging.DEBUG,
                        format=cfg['logtimestampformat'])

    # Open the Wikipedia dump through wikidump
    wikipediaDump = wikiModel.Dump(
        '/Volumes/Data/wikidump/enwiki-20130304-pages-articles.xml', False,
        False)

    # Instantiate the English Wikipedia worder
    propertyWorder = EnglishWikipediaModule()

    text = wikipediaDump.get_page_contents_by_title('Bern').decode('utf-8')

    text = expandTemplates(text, propertyWorder)

    # Preliminary wiki markup cleanup
    text = WikiExtractor.clean(text)
    # Final wiki markup cleanup (turning text into a list of section titles and paragraphs)
    text = WikiExtractor.compact(text)

    for line in text:
        print(line.encode('utf-8'))
Example #29
def patternHarvester(title, propertyWorder, wikipediaDump):

    language = propertyWorder.getLanguage()
    sourceWiki = language

    print('Working on `%s`' % title)

    ############################################################################
    # 						Fetch triples for subject				   		   #

    print_n_flush('Querying DBPedia...')

    subjectIRI = expandIRI('dbpedia:' + title)

    subjectTriples = fetchSubjectTriples(subjectIRI, language, False, False)

    print 'OK'

    # 						End of "Fetch triples for subject"				   #
    ############################################################################

    # Obtain title for the article (i.e. primary subject name)
    primaryTitleLabels = getValuesForPredicate(subjectTriples, 'rdfs:label')

    # We are pretty sure primaryTitleLabels is a singleton at this point
    # (i.e. there is exactly one triple for the predicate rdfs:label)
    try:
        assert (len(primaryTitleLabels) == 1)
    except:
        projectedTitle = title.replace('_', ' ')
        primaryTitleLabels = {unicode(projectedTitle)}
        message = "Could not find a primary label for %s, will try %s" % (
            title, projectedTitle)
        print(message)

    titleLabel = primaryTitleLabels.pop()

    ############################################################################
    # 						Retrieve article for subject					   #
    print_n_flush('Retrieving article from Wikipedia...')

    # We don't do this anymore
    # article = getCurrentWikiArticleText(sourceWiki, title)

    # We do this instead, fetching the article from the wikipedia dump
    strTitleLabel = unidecode(titleLabel)

    try:
        rawArticle = wikipediaDump.get_page_contents_by_title(strTitleLabel)
    except KeyError:
        message = "Could not fetch the article for " + titleLabel
        logging.warning(message)
        print(message)
        return

    article = rawArticle.decode('utf-8')

    print 'OK'
    # 						End of "Retrieve article for subject"
    ############################################################################

    subjectWordings = set()
    subjectWordings.add(titleLabel)

    # Retrieve secondary names (obtained from redirects to the primary article)
    # and add them as subject labels
    subjectWordings |= otherLabels(subjectIRI, language)

    # Filter and get the labels for the classes the subject is an instance of
    # (e.g. Los Angeles would have "city" as a label to an object for a
    # rdf:type triple)
    subjectClasses = getLabelsForPredicate(subjectTriples, 'rdf:type')

    wordedClassLabels = set()

    for classLabel in subjectClasses:
        captlzd, uncptlzd = propertyWorder.getClassLabelWording(classLabel)
        wordedClassLabels.add(uncptlzd)
        wordedClassLabels.add(captlzd)

    subjectWordings |= wordedClassLabels

    ### Compute and annotate wordings for triple objects

    annotatedSubjectWordings = list()

    # Cycle through all wordings for the subject and get an annotation
    # for each one
    for subjectWording in subjectWordings:
        try:
            (root, words, graph) = annotateText(subjectWording)
        except AnnotationError:
            continue

        annotatedSubjectWordings.append((subjectWording, (root, words, graph)))

    ### Compute and annotate wordings for objects in each triple
    print_n_flush('Finding and annotating wordings for triple objects...')

    annotatedObjectWordings = list()

    predicateOccurrences = dict()

    for triple in subjectTriples:
        predicate = triple['p']['value']

        if predicate in ignored:
            continue

        if predicate not in predicateOccurrences:
            predicateOccurrences[predicate] = set()

        try:
            objectWording = getCommonWording(triple, propertyWorder)
        except CommonWordingNotFound:
            # TODO: Find out if any important data types are left out
            """
			if triple['p']['value'] not in notWorded:
				notWorded[triple['p']['value']] = list()
			notWorded[triple['p']['value']].append(triple)

			pprint(triple['p']['value'] + '::' + triple['o']['value'])
			pprint(triple)
			"""
            continue

        try:
            (root, words, graph) = annotateText(objectWording)
        except AnnotationError:
            continue

        annotatedObjectWordings.append(
            (objectWording, (root, words, graph, predicate)))

    ### END of templates expansion
    print 'OK'

    ### Expand relevant templates in the Wikipedia article
    print_n_flush('Expanding relevant templates...')
    article = expandTemplates(article, propertyWorder)

    print 'OK'
    #END# Templates expansion

    ### Wiki markup cleaning
    print_n_flush('Getting rid of wiki markup...')

    # Preliminary cleanup
    article = WikiExtractor.clean(article)
    # Final cleanup (turning text into a list of section titles and paragraphs)
    article = WikiExtractor.compact(article)

    print 'OK'
    #END# Wiki markup cleaning

    # Sentence counter
    i = 0
    j = -1

    for paragraph in article:
        """ Account for a bug in the PunktSentenceTokenizer when handling
		 	sentence-ending marks followed by a double quote mark """
        paragraph = paragraph.replace('?"', '? "')
        paragraph = paragraph.replace('!"', '! "')
        paragraph = paragraph.replace('."', '. "')

        #TODO: Language-agnostic sentence tokenizer
        sentences = tokenize_sentence(paragraph)

        for sentence in sentences:
            sentence = propertyWorder.adjustText(sentence)

            # Statistics
            for ow, (owRootWord, owWords, owGraph,
                     predicate) in annotatedObjectWordings:
                if ow in sentence:
                    predicateOccurrences[predicate].add(ow)

            i += 1

            # Get the graph for this sentence
            print_n_flush('PS')

            # Parse the sentence through the Stanford NLP Core Tools
            try:
                (sentenceR, sentenceW, sentenceG, sentence,
                 sentenceWData) = annotateText(sentence, True)
            except AnnotationError:
                continue

            legalNodeIndices = map(lambda x: int(x[x.rindex("-") + 1:]),
                                   sentenceG.nodes())

            rootNode = 'ROOT-0'

            # From here on, the initials "sw" refer to "subject wording"

            for sw, (swRootWord, swWords, swGraph) in annotatedSubjectWordings:

                try:
                    swRootWordIndex = matchWording(sentence, sentenceW,
                                                   sentenceG, legalNodeIndices,
                                                   sentenceWData, sw, swWords,
                                                   swGraph, swRootWord)
                except ValueError as e:
                    """No match found for wording in sentence"""
                    continue

                subjectTarget = swRootWord + '-' + unicode(swRootWordIndex)

                # Compute and generate subgraph for shortest path to Subject
                # s1 will be the nodes from root to subject
                try:
                    s1 = set(
                        shortestPathFromRoot(sentence, sentenceG,
                                             subjectTarget))
                except ShortestPathError:
                    continue

                # From here on, the initials "ow" refer to "object wording"

                # Compute and generate subgraph for shortest path to Object
                # s2 is the set of nodes from root to object
                for ow, (owRootWord, owWords, owGraph,
                         predicate) in annotatedObjectWordings:

                    try:
                        owRootWordIndex = matchWording(
                            sentence, sentenceW, sentenceG, legalNodeIndices,
                            sentenceWData, ow, owWords, owGraph, owRootWord)

                    except ValueError as e:
                        """No match found for wording in sentence"""
                        continue

                    objectTarget = owRootWord + '-' + unicode(owRootWordIndex)

                    if objectTarget == subjectTarget:
                        """ No use for this kind of pattern """
                        continue

                    try:
                        s2 = set(
                            shortestPathFromRoot(sentence, sentenceG,
                                                 objectTarget))
                    except ShortestPathError:
                        continue

                    # At this point, we definitely have a pattern

                    # Nodes in the spanning tree comprising solely the shortest
                    # paths to the subject and to the object
                    s = s1 | s2

                    # S is the aforementioned spanning tree
                    S = nx.DiGraph(sentenceG.subgraph(s), name=predicate)

                    anonRoot = unicode(cfg['roottag'] + '-0')
                    anonSubject = unicode(cfg['subjecttag'] + '-' +
                                          unicode(swRootWordIndex))
                    anonObject = unicode(cfg['objecttag'] + '-' +
                                         unicode(owRootWordIndex))

                    renamings = dict()

                    renamings[rootNode] = anonRoot
                    renamings[subjectTarget] = anonSubject
                    renamings[objectTarget] = anonObject

                    entities = list()
                    numerals = 0

                    try:
                        for node in S.nodes():
                            if node not in renamings.keys():
                                if propertyWorder.partOfProperNoun(node):
                                    """ The word may refer to an entity, in this 
									 	case let's abstract from the word and save a 	
										relation for this pattern"""
                                    index = int(node[node.rindex('-') + 1:])

                                    anonEntity = '%s%05d-%d' % (
                                        cfg['entitytagprefix'], len(entities),
                                        index)

                                    renamings[node] = anonEntity

                                    entityWording = associatedWording(
                                        sentence,
                                        node,
                                        sentenceG,
                                        sentenceWData,
                                        allowNestedWordingMatch=True)

                                    entities.append(
                                        (entityWording,
                                         getClasses(entityWording, language)))

                                elif isNumeric(node):
                                    index = int(node[node.rindex('-') + 1:])

                                    anonNumeral = '%s%05d-%d' % (
                                        cfg['numerictagprefix'], numerals,
                                        index)
                                    numerals += 1
                                    renamings[node] = anonNumeral

                    except AnnotationError:
                        continue

                    # First anonymize subject, object and entities
                    S = nx.relabel_nodes(S, renamings)

                    # Remove indices as well
                    indexlessNodes = map(lambda word: word[0:word.rindex("-")],
                                         S.nodes())

                    S = nx.relabel_nodes(S, dict(zip(S.nodes(),
                                                     indexlessNodes)))

                    if '' in S.nodes():
                        """	A bug in either the SCNLP or the python wrapper makes empty nodes out of
						 	schwas and other unicode chars that might be used as a diacritic"""
                        # TODO: Find a fix for this
                        message = 'Invalid dependencies for this sentence: ' + sentence
                        logging.warning(message)
                        print(message)
                        continue

                    # DOT representation of the graph
                    pydotS = nx.to_pydot(S).to_string().encode(
                        encoding='UTF-8', errors='strict')

                    pattern = Pattern(pydotS, predicate, entities, title, sw,
                                      ow, sentence)

                    try:
                        saveGraph(S, pattern.hash)
                    except (TypeError, UnicodeEncodeError):
                        # TODO: Fix this "TypeError: coercing to
                        # Unicode: need string or buffer, NoneType found" error
                        # also : "UnicodeEncodeError: 'ascii' codec can't encode character"
                        checkLog = True
                        logging.warning('A graph could not be saved: '
                                        'Sentence: ' + sentence + 'Nodes: ' +
                                        str(S.nodes()) + 'Edges: ' +
                                        str(S.edges(data=True)))
                        pass

    storePredicateOccurrences(title, predicateOccurrences)
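
For reference, the anonymisation step near the end (relabel the root, subject and object nodes, then strip the word indices) only needs networkx's relabel_nodes. A small hedged illustration with made-up node names and placeholder tags in place of the cfg values used above:

import networkx as nx

S = nx.DiGraph()
S.add_edge('ROOT-0', 'born-2')
S.add_edge('born-2', 'Einstein-1')    # subject head word
S.add_edge('born-2', 'Ulm-4')         # object head word

# anonymize root / subject / object ('@ROOT', '@SBJ', '@OBJ' are placeholder tags)
renamings = {'ROOT-0': '@ROOT-0', 'Einstein-1': '@SBJ-1', 'Ulm-4': '@OBJ-4'}
S = nx.relabel_nodes(S, renamings)

# then drop the "-index" suffix from every node, as done above
S = nx.relabel_nodes(S, {n: n[:n.rindex('-')] for n in S.nodes()})
# S.nodes() -> '@ROOT', 'born', '@SBJ', '@OBJ'
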