def wikipediaSearch( self, word = "iOS", lang = 'simple', maximumNumberOfResults = 1, save = False ):
     '''
         (get) Wikipedia (corpus (documents)) (by) Search
         Takes a word 'word', searches Wikipedia for it, and stores the article texts in 'self.documents'.
         '''
     wiki = Wikipedia(lang)
     
     resultadosdebusqueda = wiki.search(word, 1, maximumNumberOfResults)
     
     numerodearticulos = len(resultadosdebusqueda)
     
     for resultado in resultadosdebusqueda:
         try:
             raw = wiki.article(resultado['title'])
         except:
             raw = None
         if raw:
             wiki2plain = Wiki2Plain(raw)
             content = wiki2plain.text
             if save:
                 f = open( resultado['title'] + '.txt', 'w+')
                 f.write(content)
                 f.close()
             self.documents.append(content)
         #os.system('clear')

         # Print the method's progress.
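A minimal usage sketch for the method above; it assumes the function is bound to a small corpus-builder class exposing a documents list (the class name Corpus is hypothetical, and Wikipedia/Wiki2Plain are the helper modules used throughout these examples):

class Corpus(object):
    def __init__(self):
        self.documents = []

Corpus.wikipediaSearch = wikipediaSearch   # bind the function defined above as a method

corpus = Corpus()
corpus.wikipediaSearch(word="Python", lang='simple', maximumNumberOfResults=2)
print(len(corpus.documents))               # number of article texts collected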
Example #2
 def input_error(self):
     cwd = os.getcwd()
     configuration_file = os.path.join(cwd,"config", "app_config.json")
     try:
         wiki = Wikipedia('tests/test.input8', configuration_file)
         wiki.parse_input()
         self.assertEqual(False,"test failed")
     except Exception as e:
         print e
         self.assertEqual(True,True)
Example #3
class WikipediaSearch(object):
    def __init__(self,mlDb='ml',maxResult=10, overwrite=False,targetDir='./'):
        """
            Init method,
            mlDb : mongodb database
            maxResult : amount of result that search for each program, default 10
            overwrite : boolean parameter that overwrite if program have already wikipedia results, default False
            targetDir: directory where save the wikipedia articles downloaded
        """
        conn= Connection()
        db = conn[mlDb]
        self.programs = db.programs
        lang = 'en'
        self.wiki = Wikipedia(lang)
        self.maxResult = maxResult
        self.overwrite = overwrite
        self.targetDir = targetDir
        logging.info('Mongodb initialized in %s db for MovieLens' % mlDb)


    def fill(self,maxProgramNumber=10):
        """This method fill program in mongodb backend, maxProgramNumber parameter
        determine how many program will fill with wikipedia results
        """
        wikipediaResultsField = 'wikipediaResults'
        wikipediaSelectedField = 'wikipediaSelected'
        for program in self.programs.find().limit(maxProgramNumber):
            if self.overwrite  or not(wikipediaSelectedField in program):
                results = self.wiki.search2(program['name'].encode('utf-8'),self.maxResult)
                print results

                if len(results)>0:
                    selected=results[0]['title']
                    self.programs.update({'_id':program['_id']}, {"$set": {wikipediaResultsField:results, wikipediaSelectedField:selected}}, upsert=False)


    def downloadArticles(self,maxProgramNumber=10):
            """This method fill program in mongodb backend, maxProgramNumber parameter
            determine how many program will fill with wikipedia results
            """
            print 'running downloadArticles'
            wikipediaResultsField = 'wikipediaResults'
            wikipediaSelectedField = 'wikipediaSelected'
            for program in self.programs.find().limit(maxProgramNumber):
                # print program['name']
                # print program['wikipediaSelected']
                if wikipediaSelectedField in program:
                    filename = program[wikipediaSelectedField].encode('utf-8').replace (" ", "").replace ("/", "").replace (":", "")+".txt"
                    print 'writing: '+self.targetDir+filename
                    f = open(self.targetDir+filename,'w')
                    rawArticle = self.wiki.article(program[wikipediaSelectedField].encode('utf-8'))
                    f.write(rawArticle)
                    f.close()
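A possible driver for the class above, assuming a local MongoDB instance with an 'ml' database populated with program documents (the directory name is illustrative):

searcher = WikipediaSearch(mlDb='ml', maxResult=5, targetDir='./articles/')
searcher.fill(maxProgramNumber=20)              # store search results and a selected title per program
searcher.downloadArticles(maxProgramNumber=20)  # write the selected raw articles under ./articles/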
Example #4
    def on_get(self, req, resp, search_term):
        """Handle search requests."""
        w = Wikipedia()

        try:
            resp.body = json.dumps(w.search([search_term]))
            resp.status = falcon.HTTP_200
        except Exception as e:
            resp.body = json.dumps({
                "Error": "Something went wrong, sorry!",
                "Exception": e
            })
            resp.status = falcon.HTTP_500
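Routing for a handler like this follows the standard Falcon pattern; a rough sketch, where the resource class name and route template are assumptions and the on_get handler above is assumed to be importable as a module-level function:

import falcon

class WikiSearchResource(object):
    pass

WikiSearchResource.on_get = on_get   # attach the handler defined above

app = falcon.API()                   # use falcon.App() on Falcon 3.x
app.add_route('/wiki/{search_term}', WikiSearchResource())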
Example #5
def add_entry():
    wiki = Wikipedia(url_entry.get())
    name, birthday, deathday = wiki.scrap_person()
    query = name + "\n" + birthday + "\n" + deathday
    send_person("1", query)
    message = recieve_message()
    message = message['data'].decode('utf-8')
    if message == "OK":
        messagebox.showinfo("Operation Successful!",
                            "Person was successfully added to the database")

    else:
        messagebox.showwarning("Operation Failed.",
                               "Person exists on the database already.")
Example #6
def wiki_extract(article, lang='pt'):

    wiki = Wikipedia(lang)
    try:
        raw = wiki.article(article)
    except:
        raw = None

    content = ""

    if raw:
        wiki2plain = Wiki2Plain(raw)
        content = wiki2plain.text

    content_dict = {"resumo": ""}
    current_pointer = content_dict
    parent_pointer = content_dict
    first = True
    for line in content.splitlines():
        line = line.strip()
        if line != "":
            if line.startswith("==") and not line.startswith("==="):
                title = __get_title(line)
                content_dict[title] = {"text": ""}
                parent_pointer = content_dict[title]
                current_pointer = content_dict[title]
                first = False
            elif line.startswith("==="):
                title = __get_title(line)
                parent_pointer[title] = {"text": ""}
                current_pointer = parent_pointer[title]
            else:
                if first:
                    content_dict["resumo"] = "%s<p>%s</p>" % (
                        current_pointer["resumo"], line)
                else:
                    current_pointer["text"] = "%s<p>%s</p>" % (
                        current_pointer["text"], line)

    return content_dict
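A short sketch of how the returned structure might be consumed (the article name is illustrative; Wikipedia, Wiki2Plain and __get_title are assumed to come from this module):

sections = wiki_extract('Brasil', lang='pt')
print(sections["resumo"])                  # lead paragraphs, wrapped in <p> tags
for title, body in sections.items():
    if title != "resumo":
        print(title, len(body["text"]))    # top-level == sections; === subsections are nested inside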
Example #8
    def no_answer_question(self):
        try:
            cwd = os.getcwd()
            configuration_file = os.path.join(cwd,"config", "app_config.json")
            wiki = Wikipedia('tests/err', configuration_file)
            wiki.parse_input()
            wiki.calculate_and_print_answers()
            self.assertEqual(False,"test failed")
        except Exception as e:
            print e
            self.assertEqual(True,True)

        try:
            wiki = Wikipedia('tests/test.input8', configuration_file)
            q = QuestionAnswer('why is the earth flat?')
            self.assertEqual( False, q.find_answer(wiki.paragraph, wiki.appconfig) )
        except Exception as e:
            self.assertEqual(False,"test failed")
Example #10
    def testAll(self):
        try:
            cwd = os.getcwd()
            configuration_file = os.path.join(cwd,"config", "app_config.json")
            wiki = Wikipedia('test.input', configuration_file)
            wiki.parse_input()
            wiki.calculate_and_print_answers()
            self.assertEqual(True,True)

        except Exception as e:
            print e
            traceback.print_exc(file=sys.stdout)
            self.assertEqual(True,False)
Example #11
 def test_dataloader(self):
     return Wikipedia("TEST", self.tokenizer,
                      transform=self.eval_transform).get_dataloader(
                          batch_size=self.hparams.bs, shuffle=False)
Example #12
File: wiki.py  Project: cmry/gomi
 def wiki(self, message):
     query, wiki = self.q.search(message), Wikipedia('en')
     try:
         return self.q.cut(Wiki2Plain(wiki.article(query), query).text)
     except:
         return "The Enrichment Center regrets to inform you that this next test is impossible."
Example #13
File: crawler.py  Project: r0pchainz/wiki
''' A script used to randomly collect Wikipedia articles '''

''' Parse command line arguments '''

parser = argparse.ArgumentParser()

parser.add_argument(
	"time_limit",
	type=int,
	help="crawling time limit in seconds"
)
parser.add_argument(
	"subdomain",
	type=str,
	help="crawling subdomain"
)
parser.add_argument(
	"-s",
	"--summary",
	action="store_true",
	help="collect summaries instead of full articles"
)

args = parser.parse_args()

''' Start crawling '''

wiki = Wikipedia(args.subdomain, args.summary)
wiki.crawl(args.time_limit)
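For example, python crawler.py 600 en --summary would crawl the English subdomain for ten minutes and collect article summaries instead of full articles, assuming the script is saved as crawler.py as the file header above indicates.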
Example #14
def main():

    data = "../resources/SOusers-Mar13.csv"  # File containing SO user dump
    results = "../resources/features3.csv"  # File where features will be stored
    picPath = "../resources/SOpictures/"  # Directory where pictures will be downloaded

    fr = open(os.path.join(data), 'rb')
    fw = open(os.path.join(results), 'ab')

    if _RANDOM:
        reader = RandomReader(fr)
    else:
        reader = UnicodeReader(fr)

    writer = UnicodeWriter(fw)

    queue = Queue()
    if _FACE:
        faceDetector = FaceDetector()

    threads = []
    SOhashes = {}  # Dictionary of user's hashes

    # Use multiple threads to download and get information
    for i in xrange(10):
        threads.append(Downloader(queue))
        threads[-1].start()

    idx = 0
    size = 4500  # Number of subjects

    for row in reader:
        if idx < size:
            so_uid = row[0]
            so_hash = row[2]
            if (not (SOhashes.has_key(so_hash))):
                SOhashes[so_hash] = so_uid
                if (not isDefaultGravatarPic(so_hash)):
                    data = [so_uid]
                    if _VISUAL_FEATURES:

                        # Download picture
                        filepath = os.path.join('%s%d.jpg' %
                                                (picPath, int(so_uid)))
                        if not os.path.isfile(filepath):
                            queue.put(
                                ('http://www.gravatar.com/avatar/%s' % so_hash,
                                 filepath))
                            time.sleep(2)

                        # Load picture
                        pic = picUtils.loadPicture(filepath)

                        if _FACE:
                            if faceDetector.isFrontFace(
                                    pic) or faceDetector.isProfileFace(pic):
                                data.append(str(True))
                            else:
                                data.append(str(False))

                        if _MOST_COMMON_COLORS:
                            _, f1, _, f2 = picUtils.mostCommonColor(pic)
                            data.append(str(f1 + f2))

                        if _NBCOLORS:
                            data.append(str(picUtils.getNbOfColors(pic)))

                        if _FARTHEST_NEIGHBOR:
                            F1 = picUtils.farthestNeighborMetric(pic, 10)
                            F2 = picUtils.farthestNeighborMetric(pic, 200)
                            data.append(str(F1))
                            data.append(str(F2))
                            if F1 != 0:
                                data.append(str(F2 / F1))
                            else:
                                data.append('?')

                        if _AVERAGE_SATURATION:
                            data.append(str(picUtils.avgSaturation(pic)))

                        if _THRESHOLD_BRIGHTNESS:
                            data.append(str(picUtils.threBrightness(pic, 0.2)))

                    if _GOOGLE:
                        gi = GoogleImage('http://www.gravatar.com/avatar/%s' %
                                         so_hash)
                        bestGuess = gi.getBestGuess()
                        if bestGuess:
                            bestGuess = bestGuess.encode('utf8')
                            data.append(bestGuess)
                            if _WIKIPEDIA:
                                gs = GoogleSearch("%s site:en.wikipedia.org" %
                                                  bestGuess)
                                wikiTitlePage = gs.getWikipediaTitlePage()
                                if wikiTitlePage:
                                    wiki = Wikipedia(wikiTitlePage)
                                    wiki.categoryGraph(4)
                                    nbCats = 10
                                    i = 0
                                    cats = wiki.sortGraphByDegree()
                                    while i < nbCats and i < len(cats):
                                        data.append(str(cats[i]))
                                        i += 1

                    # Write all information collected in the csv file
                    try:
                        print data
                        writer.writerow(data)
                        idx += 1
                    except:
                        print "Error with data"
        else:
            break
    fr.close()
    fw.close()

    # If here, download finished. Stop threads
    for i in xrange(10):
        queue.put((None, None))
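The loop at the end shuts the pool down by queueing one (None, None) sentinel per worker. The Downloader class itself is not shown in this example; a minimal sketch of what such a worker might look like (its internals here are an assumption, not the project's actual code):

from threading import Thread
from urllib.request import urlretrieve   # urllib.urlretrieve on Python 2

class Downloader(Thread):
    """Hypothetical worker: consumes (url, filepath) jobs until a (None, None) sentinel arrives."""
    def __init__(self, queue):
        Thread.__init__(self)
        self.queue = queue

    def run(self):
        while True:
            url, filepath = self.queue.get()
            if url is None:                # poison pill, queued once per worker
                break
            urlretrieve(url, filepath)     # download the Gravatar image to disk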
Example #15
from wikipedia import Wikipedia
import argparse
''' A script used to randomly collect Wikipedia articles '''
''' Parse command line arguments '''

parser = argparse.ArgumentParser()

parser.add_argument("how_many_pages", type=int, help="crawling articles limit")
parser.add_argument("subdomain", type=str, help="crawling subdomain")

args = parser.parse_args()
''' Start crawling '''

wiki = Wikipedia(args.subdomain)
wiki.crawl(args.how_many_pages)
Example #16
 def val_dataloader(self):
     print("Using Wikipedia")
     return Wikipedia("VAL", self.tokenizer,
                      transform=self.eval_transform).get_dataloader(
                          batch_size=self.hparams.bs, shuffle=False)
Example #17
 def train_dataloader(self):
     print("Using Wikipedia")
     return Wikipedia("TRAIN",
                      self.tokenizer,
                      transform=self.train_transform).get_dataloader(
                          batch_size=self.hparams.bs, shuffle=True)
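These hooks follow the PyTorch Lightning convention, so the surrounding module can be handed directly to a Trainer; a rough sketch, assuming the methods above belong to a LightningModule named WikipediaModule (a hypothetical name):

import pytorch_lightning as pl

model = WikipediaModule()      # hypothetical module defining the dataloader hooks above
trainer = pl.Trainer(max_epochs=3)
trainer.fit(model)             # consumes train_dataloader() and val_dataloader()
trainer.test(model)            # consumes test_dataloader()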
Example #18
File: wiki.py  Project: kaurbhavnit/Lexis
from wikipedia import Wikipedia
from wiki2plain import Wiki2Plain
import io

content = {}
lang = 'simple'
wiki = Wikipedia(lang)

try:
    raw = wiki.article('Arizona')
except:
    raw = None

if raw:
    wiki2plain = Wiki2Plain(raw)
    content = wiki2plain.text

print content
model_file = io.open("per.txt", "wb")
model_file.write("" + content)
model_file.close()
model_file1 = io.open("per1.txt", "wb")
i = 0
with open("per.txt", "r") as f:
    while (i < 2):
        line = f.readline()
        if "{" in line or "|" in line or "}" in line:
            print("")
        else:
            i += 1
            model_file1.write("" + line)
Example #19
from wikipedia import Wikipedia
from Wiki2Plain import Wiki2Plain

if __name__ == '__main__':

    lang = 'simple'
    wiki = Wikipedia(lang)

    try:
        raw = wiki.article('Uruguay')
        print(raw)
    except:
        raw = None

    if raw:
        wiki2plain = Wiki2Plain(bytes(raw).decode("utf-8"))
        content = wiki2plain.text
        print(content)
Example #20
 def  __init__(self):
     self.wikipedia_caller = Wikipedia()
     self.wikidata_caller = Wikidata()
     self._cate_info = {}
Example #21
		answer = row['answerB']
	elif row['correctAnswer'] == 'C':
		answer = row['answerC']
	else:
		answer = row['answerD']
	if answer not in AnswerSet:
		AnswerSet.append(answer)

for row in DictReader(open('sci_test.csv')):
	for choice in ['answerA', 'answerB', 'answerC', 'answerD']:
		if row[choice] not in AnswerSet:
			AnswerSet.append(row[choice])


lang = 'simple'
wiki = Wikipedia(lang)

counts = 0
n_answer = 0
o = DictWriter(open("wiki.csv", 'wb'), ["answer",  "question"])
o.writeheader()

counts = 0
for answer in AnswerSet:
	print n_answer
	n_answer += 1
	try:
	    raw = wiki.article(answer)
	except:
		raw = None
Example #22
class WikiCategory(object):
    """
    the class used to find categories for entities
    """
    def  __init__(self):
        self.wikipedia_caller = Wikipedia()
        self.wikidata_caller = Wikidata()
        self._cate_info = {}




    def _get_single_entity_cate(self,entity):
        """
        find the category info for a single entity

        Updates
        -----------------------------------------
        cate_info: a dict of categories for the entities

                    key: entity name
                    value: a list of categories of the key

        """

        if entity in self._cate_info:
            return
        else:
            self._cate_info[entity] = []
        try:
            entity_name = self.wikipedia_caller.get_entity_name(entity)
        except wikiexceptions.ResultErrorException:
            print "cannot find an entity for name %s" %entity
            self._cate_info[entity] = None
        try:
            entity_info = self.wikidata_caller.get_entity_info_by_name(entity_name)
        except wikiexceptions.NoClassException:
            print "entity %s has no class info" %entity_name
            self._cate_info[entity] = None

        else:
            for cid in entity_info['class_info']:
                self._cate_info[entity].append(entity_info['class_info'][cid])


    def _get_cate_for_entity_iterable(self,entity_iterable):
        """
        find the categories info for an Iterable of entities

        Updates
        -----------------------------------------
        cate_info: a dict of categories for the entities

                    key: entity name
                    value: a list of categories of the key

        """

        for entity in entity_iterable:
            self._get_single_entity_cate(entity)
            time.sleep(5)

    
    def get_cates(self,entitiy_input):
        """
        Find the categories info for entities. Do type check
        first and support both string and Iterable(except dict)
        input 

        Updates:
        -----------------------------------------
        cate_info: a dict of categories for the entities

                    key: entity name
                    value: a list of categories of the key

        """
        input_type = type(entitiy_input)
        if isinstance(entitiy_input,Iterable):
            if type(entitiy_input) == str:
                self._get_single_entity_cate(entitiy_input)
            elif input_type == dict:
                raise TypeError("unsupported type %s" %(input_type) )
            else:
                self._get_cate_for_entity_iterable(entitiy_input)

        else:
            raise TypeError("unsupported type %s" %(input_type) )


    @property
    def cate_info(self):
        return self._cate_info
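A usage sketch for the class above (entity names are illustrative; Wikipedia and Wikidata are the caller classes this project imports, not the public PyPI packages):

wc = WikiCategory()
wc.get_cates(["Albert Einstein", "Montevideo"])   # an iterable of entity names
for entity, cates in wc.cate_info.items():
    print(entity, cates)                          # cates is a list of categories, or None if lookup failed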
Example #23
from wikipedia import isQuestion, Wikipedia

browser1 = Browser()
speaker1 = Speaker()
userCommand = ""
while (True):
    speaker1.printAndSpeak("How may I help you, sir?")
    # userCommand = handleSpeech(userCommand)
    userCommand = input("Enter your command : ")
    print("Your command :", userCommand)
    if ("bye" in userCommand.lower()):
        speaker1.speak("Bye sir, see you next time")
        break
    elif (userCommand == ""):
        speaker1.printAndSpeak("Sorry sir, couldn't understand audio")
    elif (userCommand.lower() == "how are you doing"):
        speaker1.speak(
            random.choice(
                ["I am fine", "Incredible, Sir", "I am feeling great"]))
    elif ("open" in userCommand.lower()):
        browser1.open(userCommand)
    elif ("weather" in userCommand.lower()):
        displayWeather(userCommand)
    elif (isQuestion(userCommand)):
        wiki = Wikipedia(userCommand)
        wiki.findKeyword()
        if (wiki.doesPageExist()):
            wiki.displaySummary()
        else:
            speaker1.printAndSpeak("Sorry sir, couldn't get information ")
Example #25
import sys
from wikipedia import Wikipedia
from wiki2plain import Wiki2Plain

lang = 'en'
wiki = Wikipedia(lang)

try:
    articleName = str(sys.argv[1])#'Uruguay'
    raw = wiki.article(articleName)
except:
    raw = None

if raw:
    wiki2plain = Wiki2Plain(raw)
    f = open('../corpus/' + articleName + '.txt', 'w')
    f.write(wiki2plain.text)
    #content = wiki2plain.text
    #print(wiki2plain.text)
Example #26
#!/usr/bin/python

import sys
import os
import traceback
from wikipedia import Wikipedia


if __name__ == '__main__':
    if len(sys.argv) < 2:
        print "usage : python main.py <input-file-name>"
        sys.exit(1)

    input_filename = sys.argv[1]
    cwd = os.getcwd()
    configuration_file = os.path.join(cwd,"config", "app_config.json")
    if not os.path.isfile(configuration_file):
        print "cannot access file ", configuration_file
        sys.exit(1)

    try:
        wiki = Wikipedia(input_filename, configuration_file)        
        wiki.parse_input()
        wiki.calculate_and_print_answers()
    except Exception as e:
        print e
        #traceback.print_exc(file=sys.stdout)
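Per the usage message above, the script is invoked as python main.py <input-file-name>, and it expects app_config.json under a config directory in the current working directory.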