예제 #1
0
 def run(self, q):
     """Return the URLs of the Bing results for query *q* over result pages 1-10.

     :param q: the search query string
     :return: list of result URLs, in the order Bing returned them
     """
     engine = Bing(license=None)  # Enter your license key.
     collected = []
     for page in range(1, 11):
         collected.extend(hit.url for hit in engine.search(q, type=SEARCH, start=page))
     return collected
    def get_urls(self, q = "", n = 1, limit = 1):
        """Return at most *limit* result URLs for query *q* from Bing.

        Fetches result pages 1..n, 10 hits per page.

        NOTE(review): the reload()/sys.setdefaultencoding() pair is a
        Python 2-only hack that forces the process default encoding;
        kept verbatim because callers may rely on it.
        """
        reload(sys)
        sys.setdefaultencoding(GOOGLE_API_ENCODING)
        engine = Bing(license=BING_API_KEY, language=BING_API_LANG)
        found = []
        for page in range(1, n + 1):
            found.extend(hit.url
                         for hit in engine.search(q, start=page, count=10,
                                                  type=SEARCH, cached=False))

        return found[:limit]
예제 #3
0
 def get_bing_entries(self, search, nb):
     """Query Bing for *search* and wrap each result as an annotated Input.

     :param search: the query string
     :param nb: number of results to request
     :return: list of Input objects; each one's first segment carries the
              result's source, title, url and the original query
     """
     engine = Bing(language=self.dico_lang[self.language])
     collected = []
     for hit in engine.search(search, start=1, count=nb, cached=False):
         wrapped = Input(hit.text)
         wrapped.segments[0].annotations.update({
             'source': 'Bing',
             'title': hit.title,
             'url': hit.url,
             'search': search,
         })
         collected.append(wrapped)
     return collected
예제 #4
0
 def get_bing_entries(self, search, nb):
     """Query Bing for *search* and wrap each result as an annotated Input.

     :param search: the query string
     :param nb: number of results to request
     :return: list of Input objects whose first segment is annotated with
              the result's source, title, url and the original query
     """
     engine = Bing(language=self.dico_lang[self.language])
     collected = []
     for hit in engine.search(search, start=1, count=nb, cached=False):
         wrapped = Input(hit.text)
         meta = {
             'source': 'Bing',
             'title': hit.title,
             'url': hit.url,
             'search': search,
         }
         # Read-modify-write through item indexing so any __setitem__
         # side effects on the Input still happen.
         first = wrapped[0]
         first.annotations.update(meta)
         wrapped[0] = first
         collected.append(wrapped)
     return collected
예제 #5
0
File: web.py  Project: antiface/pynlpl
def bingcorpsearch(word,concfilter = '', extraquery='',license=None, start=1, count=50):
    """Search the web for sentences containing *word* (and optionally the
    co-occurring *concfilter*).  Generator yielding
    (leftcontext, word, rightcontext, url) tuples.

    Queries Bing (via 'pattern', CLiPS, Antwerp University) and downloads
    the pages of the top search results.
    """
    if not concfilter:
        query = word
    else:
        query = word + ' ' + concfilter
    if extraquery:
        query += ' ' + extraquery

    engine = Bing(license=license)

    processed = {}  # URLs already handled, to avoid duplicates

    for result in engine.search(query, start=start, count=count):
        if result.url in processed:
            continue
        processed[result.url] = True
        try:
            content = plaintext(result.download())
        except Exception:
            # Best-effort: skip pages that fail to download or parse.
            continue

        begin = 0          # start index of the current sentence
        wordindex = None   # index of *word* within the current sentence
        wordlength = 0
        concindex = None   # index of *concfilter* within the current sentence
        for i in range(1, len(content)):
            if content[i] in ('.', '?', '!', '\n'):
                # Sentence boundary: yield the concordance if the keyword
                # (and the co-occurrence filter, when given) was seen.
                if wordindex is not None and wordindex >= begin and (
                        (not concfilter) or
                        (concindex is not None and concindex >= begin)):
                    if len(content[begin:wordindex].strip()) > 5 or len(content[wordindex + wordlength:i + 1].strip()) > 5:
                        yield (content[begin:wordindex].strip(),
                               content[wordindex:wordindex + wordlength].strip(),
                               content[wordindex + wordlength:i + 1],
                               result.url)
                wordindex = concindex = None
                begin = i + 1
            # Case-insensitive keyword match starting at position i.
            if len(word) + i <= len(content) and content[i:i + len(word)].lower() == word.lower():
                wordindex = i
                wordlength = len(word)
                # Extend the match to the next delimiter; NOTE(review): j is
                # an offset relative to i, starting past the keyword itself.
                for j in range(len(word), len(content)):
                    if i + j < len(content) and content[i + j] in (' ', '?', '!', '\n'):
                        wordlength = j
                        break
            # BUGFIX: the original sliced content[i:len(concfilter)], which is
            # empty once i >= len(concfilter), so concindex was almost never
            # set; slice relative to i, mirroring the keyword check above.
            if concfilter and content[i:i + len(concfilter)].lower() == concfilter.lower():
                concindex = i
예제 #6
0
def novelty(word):
    """ Returns the novelty of the given word as a value 0.0-1.0 (1.0 = 100% novel).
    """
    engine = Bing()  # Google(license="...") works too.

    # Number of search results that mention the given word.
    # http://www.clips.ua.ac.be/pages/pattern-web#services
    # (cached=False would give the most up-to-date count.)
    hits = engine.search(word, cached=True).total

    # Raw counts range from ~1.7e9 ("and") down to 0 for invented words
    # ("zombriefing", "zombifractor"): common words are mentioned
    # thousands-to-billions of times, neologisms dozens of times or never.

    # Cut off above 100 -- anything mentioned 100+ times on the net is
    # considered not novel -- then map the capped count linearly onto
    # 1.0 (novel) .. 0.0 (common).
    return 1.0 - min(hits, 100) * 0.01
예제 #7
0
def novelty(word):
    """ Returns the novelty of the given word as a value 0.0-1.0 (1.0 = 100% novel).
    """
    # Result count for the word; see
    # http://www.clips.ua.ac.be/pages/pattern-web#services
    # (cached=False would give the most up-to-date figure).
    total = Bing().search(word, cached=True).total

    # Empirically, common words score millions-to-billions of mentions
    # ("and": ~1.7e9, "tree": ~7.8e7) while invented words score near zero
    # ("zombeliever": 11, "zombifractor": 0), so 100 mentions is treated
    # as the "not novel" ceiling.
    capped = total if total < 100 else 100

    # Relativize: 0 mentions -> 1.0 (fully novel), 100+ -> 0.0.
    return 1.0 - capped * 0.01
예제 #8
0
from pattern.web import Bing, IMAGE
import urllib

engine = Bing()

# Save every Bing image result for the query 'meme' as images/<n>.jpg.
# NOTE(review): assumes the images/ directory already exists.
for counter, result in enumerate(engine.search('meme', type=IMAGE)):
    try:
        urllib.urlretrieve(result.url, "images/%s.jpg" % counter)
    except Exception:
        # BUGFIX: was a bare `except:`, which also swallows
        # KeyboardInterrupt/SystemExit; keep the best-effort skip,
        # but only for ordinary errors.
        pass
예제 #9
0
# "X IS MORE IMPORTANT THAN Y"
# Here is a rough example of how to build a web miner.
# It mines comparative statements from Bing and stores the results in a table,
# which can be saved as a text file for further processing later on.

# Pattern matching also works with Sentence objects from the MBSP module.
# MBSP's parser is much more robust (but also slower).
#from MBSP import Sentence, parse

q = '"more important than"'          # Bing search query
p = "NP (VP) more important than NP" # Search pattern.
p = Pattern.fromstring(p)
d = Datasheet()

engine = Bing(license=None)
for i in range(1): # max=10
    for result in engine.search(q, start=i+1, count=100, cached=True):
        s = result.description
        s = plaintext(s)
        s = Sentence(parse(s))
        for m in p.search(s):
            a = m.constituents(constraint=0)[-1] # Left NP.
            b = m.constituents(constraint=5)[ 0] # Right NP.
            d.append((
                a.string.lower(), 
                b.string.lower()))

pprint(d)

print
print len(d), "results."
예제 #10
0
    # NOTE(review): orphaned fragment -- the enclosing `def` (presumably a
    # google_search(match, ...) sibling of bing_search below) was lost when
    # this example was captured; `match` and `targetfile` come from that
    # missing context.
    engine = Google(license=None)
    for i in range(1,10):
        for result in engine.search(match, type=SEARCH, start=i):
              print plaintext(result.description)
              targetfile.write(plaintext(result.description))
              targetfile.write('\n')

def bing_search(match):
    engine = Bing
    for i in range(1,10):
        for result in engine.search(match, type=SEARCH, start=i):
              print plaintext(result.description)

engine = Bing() # Enter your license key.
# Print the plaintext description of every result on pages 1-14 of a
# Bing search for 'holy' (Python 2 print statements).
for i in range(1,15):
    for result in engine.search('holy', type=SEARCH, start=i):
        print plaintext(result.description)
        print
# Commented-out earlier experiments, kept verbatim for reference:
#google_search(u'شیر مادر', milkfile)
#google_search(u'شیر وحشی', lionfile)
#google_search(u'شیر آب', tapfile)
##article =  engine.search(match)
#print article.title
#for link in  article.links:
#    print link
#    #subarticle = engine.search(link)
#    url = URL(link)
#    result = Result(url)
#    print result.download()

#    print item
예제 #11
0
File: web.py  Project: zzmjohn/pynlpl
def bingcorpsearch(word,
                   concfilter='',
                   extraquery='',
                   license=None,
                   start=1,
                   count=50):
    """Search the web for sentences containing *word* (and optionally the
    co-occurring *concfilter*).  Generator yielding
    (leftcontext, word, rightcontext, url) tuples.

    Queries Bing (via 'pattern', CLiPS, Antwerp University) and downloads
    the pages of the top search results.
    """
    query = word if not concfilter else word + ' ' + concfilter
    if extraquery:
        query += ' ' + extraquery

    engine = Bing(license=license)

    processed = {}  # URLs already handled, to avoid duplicates

    for result in engine.search(query, start=start, count=count):
        if result.url in processed:
            continue
        processed[result.url] = True
        try:
            content = plaintext(result.download())
        except Exception:
            # Best-effort: skip pages that fail to download or parse.
            continue

        begin = 0          # start index of the current sentence
        wordindex = None   # index of *word* within the current sentence
        wordlength = 0
        concindex = None   # index of *concfilter* within the current sentence
        for i in range(1, len(content)):
            if content[i] in ('.', '?', '!', '\n'):
                # Sentence boundary: yield the concordance if the keyword
                # (and the co-occurrence filter, when given) was seen.
                if wordindex is not None and wordindex >= begin and (
                        (not concfilter) or
                        (concindex is not None and concindex >= begin)):
                    left = content[begin:wordindex].strip()
                    right = content[wordindex + wordlength:i + 1]
                    if len(left) > 5 or len(right.strip()) > 5:
                        yield (left,
                               content[wordindex:wordindex +
                                       wordlength].strip(),
                               right,
                               result.url)
                wordindex = concindex = None
                begin = i + 1
            # Case-insensitive keyword match starting at position i.
            if (len(word) + i <= len(content)
                    and content[i:i + len(word)].lower() == word.lower()):
                wordindex = i
                wordlength = len(word)
                # Extend the match to the next delimiter; NOTE(review): j is
                # an offset relative to i, starting past the keyword itself.
                for j in range(len(word), len(content)):
                    if (i + j < len(content)
                            and content[i + j] in (' ', '?', '!', '\n')):
                        wordlength = j
                        break
            # BUGFIX: the original sliced content[i:len(concfilter)], which is
            # empty once i >= len(concfilter), so concindex was almost never
            # set; slice relative to i, mirroring the keyword check above.
            if (concfilter and
                    content[i:i + len(concfilter)].lower() == concfilter.lower()):
                concindex = i
예제 #12
0
text = sys.stdin.read()
invention = machine.Invention(text)
engine = Bing(license=None)

#the following searches for patent illustrations on bing, using a generated noun from each description of the illustration
#search_base = "patent illustration "
#for i, illustration in enumerate(invention.unformatted_illustrations):
#nouns = [word for word, pos in tag(illustration) if pos == 'NN']
#if len(nouns) > 0:
#search_string = search_base + random.choice(nouns)#' '.join(nouns)
#print "searching for: " + search_string
#for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)):
#print "saving: " + result.url
#try:
#save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1))
#except:
#next

# the following searches for "fig N patent illustration"
search_base = " patent illustration"
for i, illustration in enumerate(invention.unformatted_illustrations):
    search_string = 'fig ' + str(i + 1) + search_base
    print "searching for: " + search_string
    for j, result in enumerate(
            engine.search(search_string, type=IMAGE, count=5)):
        print "saving: " + result.url
        try:
            save_image(result.url, "fig_" + str(i + 1) + "_" + str(j + 1))
        except:
            next
text = sys.stdin.read()
invention = machine.Invention(text)
engine = Bing(license=None)

#the following searches for patent illustrations on bing, using a generated noun from each description of the illustration
#search_base = "patent illustration "
#for i, illustration in enumerate(invention.unformatted_illustrations):
    #nouns = [word for word, pos in tag(illustration) if pos == 'NN']
    #if len(nouns) > 0:
        #search_string = search_base + random.choice(nouns)#' '.join(nouns) 
        #print "searching for: " + search_string
        #for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)):
            #print "saving: " + result.url
            #try:
                #save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1))
            #except:
                #next

# the following searches for "fig N patent illustration"
# Saves the first 5 Bing image results for each illustration as fig_<i>_<j>.
# NOTE(review): `next` in the except clause just evaluates the builtin -- a
# no-op (Ruby-style `next` is not Python's `continue`) -- so download
# failures are silently skipped.
search_base = " patent illustration"
for i, illustration in enumerate(invention.unformatted_illustrations):
    search_string = 'fig ' + str(i+1) + search_base
    print "searching for: " + search_string
    for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)):
        print "saving: " + result.url
        try:
            save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1))
        except:
            next
예제 #14
0
        # NOTE(review): orphaned fragment -- the enclosing `def` (presumably a
        # google_search(match, ...) sibling of bing_search below) was lost
        # when this example was captured; `engine`, `match`, `i` and
        # `targetfile` come from that missing context.
        for result in engine.search(match, type=SEARCH, start=i):
            print plaintext(result.description)
            targetfile.write(plaintext(result.description))
            targetfile.write('\n')


def bing_search(match):
    engine = Bing
    for i in range(1, 10):
        for result in engine.search(match, type=SEARCH, start=i):
            print plaintext(result.description)


engine = Bing()  # Enter your license key.
# Print the plaintext description of every result on pages 1-14 of a
# Bing search for 'holy' (Python 2 print statements).
for i in range(1, 15):
    for result in engine.search('holy', type=SEARCH, start=i):
        print plaintext(result.description)
        print
# Commented-out earlier experiments, kept verbatim for reference:
#google_search(u'شیر مادر', milkfile)
#google_search(u'شیر وحشی', lionfile)
#google_search(u'شیر آب', tapfile)
##article =  engine.search(match)
#print article.title
#for link in  article.links:
#    print link
#    #subarticle = engine.search(link)
#    url = URL(link)
#    result = Result(url)
#    print result.download()

#    print item