示例#1
0
def WhenSearch(query):
    # Finds name of person (if applicable) in query
    potentialname = regex.name.search(query)
    if potentialname:
        nameInQuery = potentialname.group(0)
        hits = search.urls(query)
        for hit in hits:
            print hit["titleNoFormatting"]
            request = Request(hit["url"])
            page = urlopen(request).read()
            soup = BeautifulSoup(page, "html.parser")
            potentialnameInTitle = regex.name.search(hit["titleNoFormatting"])
            if potentialnameInTitle:
                nameInTitle = potentialnameInTitle.group(0)
                if nameInTitle == nameInQuery:
                    print "found it"
            else:
                hits = search.urls(query)
                request = Request(hits[0]["url"])
                page = urlopen(request).read()
                soup = BeautifulSoup(page, "html.parser")
    else:
        hits = search.urls(query)
        request = Request(hits[0]["url"])
        page = urlopen(request).read()
        soup = BeautifulSoup(page, "html.parser")
    # Saving the output of beautiful soup for debugging
    file = open("SoupOutput", "w")
    file.write(soup.get_text().encode("utf8"))
    # Basically finds action word in query (born, die, etc.)
    action = regex.FindActionInQuery(query)
    # Then from there finds key words related to action word, i.e. born -> birth, birthday, etc.
    keyWords = regex.FindKeyWords(action)
    searchRange = regex.FindSearchRange(keyWords, soup.get_text().encode("utf8"))
    searchRange = searchRange + regex.FindParentheses(soup.get_text().encode("utf8"))
    dates = []
    for searchText in searchRange:
        print searchText
        dates.append(regex.FindAmericanExtendedDate(searchText))
        dates.append(regex.FindAmericanCondensedDate(searchText))
        dates.append(regex.FindEuropeanExtendedDate(searchText))
    dates = filter(None, dates)
    if dates:
        print dates[0]
        return dates[0]
    else:
        return "Error: Answer not found"
示例#2
0
def WhoSearch(query):
    hits = search.urls(query)
    if hits:
        results = []
        for i in hits:
            results.append(i["url"])
        # results is now a list containing URLs for the top (usually 4) Google search results for the Who query.
        page = urllib2.urlopen(results[0]).read()  # may later add code to read other URLs as well
        soup = bs4.BeautifulSoup(page, "html.parser")  # BeautifulSoup parsing of URL of first Google result
        pagetext = soup.get_text().encode("utf8")
        nameresult = regex.name.search(pagetext)  # first potential "name" match within the BeautifulSoup text.
        if nameresult:  # if there is a match for a potential "name", let's check to make sure it's actually a name.
            # startind = 0
            print nameresult.group(0)  # prints potential match
            abspath = os.path.abspath(__file__)
            dname = os.path.dirname(abspath)
            os.chdir(dname)
            words = open("wordsEn.txt", "r").read()
            names = open("first-names-female.txt", "r").read() + open("first-names-male.txt", "r").read()
            name1words = str("\n" + nameresult.group(0).split(" ")[0] + "\n").lower()
            name2words = str("\n" + nameresult.group(0).split(" ")[1] + "\n").lower()
            name1names = str("\n" + nameresult.group(0).split(" ")[0] + "\n")
            name2names = str("\n" + nameresult.group(0).split(" ")[1] + "\n")
            # print "First name in words: " + str(name1words in words)
            # print "First name in names: " + str(name1names in names)
            # the purpose of this next loop is to skip false positives until it finds a result where the first two parts of the name are NOT [in words and not in names]
            while ((name1words in words) and not (name1names in names)) or (
                (name2words in words) and not (name2names in names)
            ):
                pagetext = pagetext[
                    (pagetext.find(nameresult.group(0)) + len(nameresult.group(0).split(" ")[0])) :
                ]  # deletes everything up to (and including) the first word of the current false positive.
                # print "length is: " + str(startind+len(nameresult.group(0)))
                # print "nameresult.group(0) is: " + str(nameresult.group(0))
                # print "Startind is: " + str(startind) + " Found in rest of text?: " + str(nameresult.group(0) in pagetext[startind+len(nameresult.group(0)):])
                nameresult = regex.name.search(pagetext)
                # print "The current name being considered is: " + str(nameresult.group(0))
                name1words = str("\n" + nameresult.group(0).split(" ")[0] + "\n").lower()
                name2words = str("\n" + nameresult.group(0).split(" ")[1] + "\n").lower()
                name1names = str("\n" + nameresult.group(0).split(" ")[0] + "\n")
                name2names = str("\n" + nameresult.group(0).split(" ")[1] + "\n")
                # print "First name: " + nameresult.group(0).split(" ")[0]
                # print "First name in names: " + str(name1names in names)
                # print "First name in words: " + str(name1words in words)
                # print "second name: " + nameresult.group(0).split(" ")[1]
                # print "second name in names: " + str(name2names in names)
                # print "second name in words: " + str(name2words in words)
                # print "________________"
            return nameresult.group(0)
    else:
        return "An error occurred: No results found. If Googling your query returns results, please wait a while and try entering it here again."