def run(self, q):
    """Search Bing for the query *q* and return the collected result URLs.

    Iterates over result pages 1..10 (10 pages of standard web search
    results) and gathers the URL of every hit.
    """
    engine = Bing(license=None)  # Enter your license key.
    collected = []
    for page in range(1, 11):
        for hit in engine.search(q, type=SEARCH, start=page):
            collected.append(hit.url)
    return collected
def get_urls(self, q="", n=1, limit=1):
    """Search Bing for *q* and return at most *limit* result URLs.

    Fetches up to *n* pages of 10 results each, stopping early as soon
    as *limit* URLs have been collected (the original kept fetching all
    *n* pages and sliced afterwards).
    """
    urls = []
    # NOTE(review): reload(sys) + setdefaultencoding is a Python 2 hack;
    # kept because other call sites may rely on the process-wide encoding.
    reload(sys)
    sys.setdefaultencoding(GOOGLE_API_ENCODING)
    # Renamed from the misleading `engine_google`: this is a Bing engine.
    engine = Bing(license=BING_API_KEY, language=BING_API_LANG)
    for page in range(1, n + 1):
        for result in engine.search(q, start=page, count=10, type=SEARCH,
                                    cached=False):
            urls.append(result.url)
            if len(urls) >= limit:
                # Enough URLs collected -- no need to fetch further pages.
                return urls
    return urls[:limit]
def get_bing_entries(self, search, nb):
    """Run a Bing search and return one annotated Input per result.

    Each result's text becomes an Input whose first segment is annotated
    with the source, title, URL and the search string used.
    """
    engine = Bing(language=self.dico_lang[self.language])
    collected = []
    for hit in engine.search(search, start=1, count=nb, cached=False):
        item = Input(hit.text)
        item.segments[0].annotations.update({
            'source': 'Bing',
            'title': hit.title,
            'url': hit.url,
            'search': search,
        })
        collected.append(item)
    return collected
def get_bing_entries(self, search, nb):
    """Run a Bing search and return one annotated Input per result.

    The first segment of each Input is read out, annotated with source
    metadata, and written back before the Input is collected.
    """
    engine = Bing(language=self.dico_lang[self.language])
    outputs = []
    for hit in engine.search(search, start=1, count=nb, cached=False):
        item = Input(hit.text)
        metadata = {
            'source': 'Bing',
            'title': hit.title,
            'url': hit.url,
            'search': search,
        }
        segment = item[0]
        segment.annotations.update(metadata)
        item[0] = segment
        outputs.append(item)
    return outputs
def bingcorpsearch(word, concfilter='', extraquery='', license=None, start=1, count=50):
    """Searches the web for sentences containing a certain keyword, and
    possibly a co-occurrence word.

    Generator yielding (leftcontext, word, rightcontext, url) tuples.
    First queries Bing, and then retrieves the pages of the top search
    results.  Uses 'pattern' (CLiPS, Antwerpen University).
    """
    if not concfilter:
        query = word
    else:
        query = word + ' ' + concfilter
    if extraquery:
        query += ' ' + extraquery
    engine = Bing(license=license)
    processed = {}
    for result in engine.search(query, start=start, count=count):
        if not result.url in processed:
            processed[result.url] = True
            try:
                content = plaintext(result.download())
            except:
                # Best-effort: skip pages that fail to download or parse.
                continue
            begin = 0
            wordindex = None   # start offset of the keyword in the current sentence
            wordlength = 0
            concindex = None   # start offset of the co-occurrence word, if any
            for i in range(1, len(content)):
                # Sentence boundary: emit the current match (if any) and reset.
                if content[i] == '.' or content[i] == '?' or content[i] == '!' or content[i] == '\n':
                    # BUG FIX: guard against None before >= (None >= int is
                    # False in Py2 but a TypeError in Py3); behavior unchanged.
                    if (wordindex is not None and wordindex >= begin) and \
                            ((concfilter and concindex is not None and concindex >= begin) or (not concfilter)):
                        # Require some real context on at least one side.
                        if len(content[begin:wordindex].strip()) > 5 or len(content[wordindex + wordlength:i + 1].strip()) > 5:
                            yield (content[begin:wordindex].strip(),
                                   content[wordindex:wordindex + wordlength].strip(),
                                   content[wordindex + wordlength:i + 1],
                                   result.url)
                    wordindex = concindex = None
                    begin = i + 1
                # Keyword match at position i (case-insensitive).
                if len(word) + i <= len(content) and content[i:i + len(word)].lower() == word.lower():
                    wordindex = i
                    wordlength = len(word)
                    # Extend the match to the next delimiter to capture the whole token.
                    for j in range(len(word), len(content)):
                        if i + j < len(content) and (content[i + j] == ' ' or content[i + j] == '?' or content[i + j] == '!' or content[i + j] == '\n'):
                            wordlength = j
                            break
                # BUG FIX: the original sliced content[i:len(concfilter)]
                # (an absolute end offset), so the co-occurrence filter almost
                # never matched; the slice must be relative to i.
                if concfilter and content[i:i + len(concfilter)].lower() == concfilter.lower():
                    concindex = i
def novelty(word):
    """ Returns the novelty of the given word as a value 0.0-1.0 (1.0 = 100% novel). """
    engine = Bing()  # Google(license="...")
    # Get the number of search results that mention the given word.
    # http://www.clips.ua.ac.be/pages/pattern-web#services
    # Note: we should use cached=False to get the most up-to-date count.
    total = engine.search(word, cached=True).total
    # It would be nice if this number was relative (0.0-1.0),
    # then we could represent novelty as a percentage,
    # based on the number of existing web pages that mention the word.
    # Some raw totals: "and" ~ 1.7 billion, "tree" ~ 78 million,
    # "zombification" ~ 126000, "zombeliever" ~ 11, "zombriefing" ~ 0.
    # Common words are mentioned thousands of times, invented words
    # dozens of times -- so we cut the count off above 100
    # (= anything mentioned 100x times on the net is not novel).
    capped = min(total, 100)
    # Relativize: 0 mentions -> 1.0 (fully novel), 100+ mentions -> 0.0.
    return 1.0 - capped * 0.01
from pattern.web import Bing, IMAGE
import urllib

# Download every Bing image-search hit for 'meme' into images/<n>.jpg.
engine = Bing()
for index, hit in enumerate(engine.search('meme', type=IMAGE)):
    try:
        urllib.urlretrieve(hit.url, "images/%s.jpg" % index)
    except:
        # Best-effort: skip images that fail to download.
        pass
# "X IS MORE IMPORTANT THAN Y" # Here is a rough example of how to build a web miner. # It mines comparative statements from Bing and stores the results in a table, # which can be saved as a text file for further processing later on. # Pattern matching also works with Sentence objects from the MBSP module. # MBSP's parser is much more robust (but also slower). #from MBSP import Sentence, parse q = '"more important than"' # Bing search query p = "NP (VP) more important than NP" # Search pattern. p = Pattern.fromstring(p) d = Datasheet() engine = Bing(license=None) for i in range(1): # max=10 for result in engine.search(q, start=i+1, count=100, cached=True): s = result.description s = plaintext(s) s = Sentence(parse(s)) for m in p.search(s): a = m.constituents(constraint=0)[-1] # Left NP. b = m.constituents(constraint=5)[ 0] # Right NP. d.append(( a.string.lower(), b.string.lower())) pprint(d) print print len(d), "results."
engine = Google(license=None) for i in range(1,10): for result in engine.search(match, type=SEARCH, start=i): print plaintext(result.description) targetfile.write(plaintext(result.description)) targetfile.write('\n') def bing_search(match): engine = Bing for i in range(1,10): for result in engine.search(match, type=SEARCH, start=i): print plaintext(result.description) engine = Bing() # Enter your license key. for i in range(1,15): for result in engine.search('holy', type=SEARCH, start=i): print plaintext(result.description) print #google_search(u'شیر مادر', milkfile) #google_search(u'شیر وحشی', lionfile) #google_search(u'شیر آب', tapfile) ##article = engine.search(match) #print article.title #for link in article.links: # print link # #subarticle = engine.search(link) # url = URL(link) # result = Result(url) # print result.download() # print item
def bingcorpsearch(word, concfilter='', extraquery='', license=None, start=1, count=50):
    """Searches the web for sentences containing a certain keyword, and
    possibly a co-occurrence word.

    Generator yielding (leftcontext, word, rightcontext, url) tuples.
    First queries Bing, and then retrieves the pages of the top search
    results.  Uses 'pattern' (CLiPS, Antwerpen University).
    """
    if not concfilter:
        query = word
    else:
        query = word + ' ' + concfilter
    if extraquery:
        query += ' ' + extraquery
    engine = Bing(license=license)
    processed = {}
    for result in engine.search(query, start=start, count=count):
        if not result.url in processed:
            processed[result.url] = True
            try:
                content = plaintext(result.download())
            except:
                # Best-effort: skip pages that fail to download or parse.
                continue
            begin = 0
            wordindex = None   # start offset of the keyword in the current sentence
            wordlength = 0
            concindex = None   # start offset of the co-occurrence word, if any
            for i in range(1, len(content)):
                # Sentence boundary: emit the current match (if any) and reset.
                if content[i] == '.' or content[i] == '?' or content[i] == '!' or content[i] == '\n':
                    # BUG FIX: guard against None before >= (None >= int is
                    # False in Py2 but a TypeError in Py3); behavior unchanged.
                    if (wordindex is not None and wordindex >= begin) and \
                            ((concfilter and concindex is not None and concindex >= begin) or (not concfilter)):
                        # Require some real context on at least one side.
                        if len(content[begin:wordindex].strip()) > 5 or len(content[wordindex + wordlength:i + 1].strip()) > 5:
                            yield (content[begin:wordindex].strip(),
                                   content[wordindex:wordindex + wordlength].strip(),
                                   content[wordindex + wordlength:i + 1],
                                   result.url)
                    wordindex = concindex = None
                    begin = i + 1
                # Keyword match at position i (case-insensitive).
                if len(word) + i <= len(content) and content[i:i + len(word)].lower() == word.lower():
                    wordindex = i
                    wordlength = len(word)
                    # Extend the match to the next delimiter to capture the whole token.
                    for j in range(len(word), len(content)):
                        if i + j < len(content) and (content[i + j] == ' ' or content[i + j] == '?' or content[i + j] == '!' or content[i + j] == '\n'):
                            wordlength = j
                            break
                # BUG FIX: the original sliced content[i:len(concfilter)]
                # (an absolute end offset), so the co-occurrence filter almost
                # never matched; the slice must be relative to i.
                if concfilter and content[i:i + len(concfilter)].lower() == concfilter.lower():
                    concindex = i
text = sys.stdin.read() invention = machine.Invention(text) engine = Bing(license=None) #the following searches for patent illustrations on bing, using a generated noun from each description of the illustration #search_base = "patent illustration " #for i, illustration in enumerate(invention.unformatted_illustrations): #nouns = [word for word, pos in tag(illustration) if pos == 'NN'] #if len(nouns) > 0: #search_string = search_base + random.choice(nouns)#' '.join(nouns) #print "searching for: " + search_string #for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)): #print "saving: " + result.url #try: #save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1)) #except: #next # the following searches for "fig N patent illustration" search_base = " patent illustration" for i, illustration in enumerate(invention.unformatted_illustrations): search_string = 'fig ' + str(i + 1) + search_base print "searching for: " + search_string for j, result in enumerate( engine.search(search_string, type=IMAGE, count=5)): print "saving: " + result.url try: save_image(result.url, "fig_" + str(i + 1) + "_" + str(j + 1)) except: next
text = sys.stdin.read() invention = machine.Invention(text) engine = Bing(license=None) #the following searches for patent illustrations on bing, using a generated noun from each description of the illustration #search_base = "patent illustration " #for i, illustration in enumerate(invention.unformatted_illustrations): #nouns = [word for word, pos in tag(illustration) if pos == 'NN'] #if len(nouns) > 0: #search_string = search_base + random.choice(nouns)#' '.join(nouns) #print "searching for: " + search_string #for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)): #print "saving: " + result.url #try: #save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1)) #except: #next # the following searches for "fig N patent illustration" search_base = " patent illustration" for i, illustration in enumerate(invention.unformatted_illustrations): search_string = 'fig ' + str(i+1) + search_base print "searching for: " + search_string for j, result in enumerate(engine.search(search_string, type=IMAGE, count=5)): print "saving: " + result.url try: save_image(result.url, "fig_" + str(i+1) + "_" + str(j+1)) except: next
for result in engine.search(match, type=SEARCH, start=i): print plaintext(result.description) targetfile.write(plaintext(result.description)) targetfile.write('\n') def bing_search(match): engine = Bing for i in range(1, 10): for result in engine.search(match, type=SEARCH, start=i): print plaintext(result.description) engine = Bing() # Enter your license key. for i in range(1, 15): for result in engine.search('holy', type=SEARCH, start=i): print plaintext(result.description) print #google_search(u'شیر مادر', milkfile) #google_search(u'شیر وحشی', lionfile) #google_search(u'شیر آب', tapfile) ##article = engine.search(match) #print article.title #for link in article.links: # print link # #subarticle = engine.search(link) # url = URL(link) # result = Result(url) # print result.download() # print item