Example #1
import json  # standard library; the other helpers (get_4sq_page, get_4sq, get_page, get_venue, write_dat) are project-specific

def get_all_friends(user1):
    friends = []
    page = get_4sq_page('users', user1)
    if page is None:
        return friends
    user = json.loads(page)['response']['user']
    count = 0
    if 'checkins' in user:
        checkins = user['checkins']
        if 'count' in checkins:
            count = checkins['count']
    #print user, count
    firstName = user['firstName'] if 'firstName' in user else 'firstName'
    lastName = user['lastName'] if 'lastName' in user else 'lastName'
    homeCity = user['homeCity'] if 'homeCity' in user else 'homeCity'
    write_dat('user', [user1, firstName, lastName, homeCity.replace(' ','_'), count])
    for group in user['friends']['groups']:
        for item in group['items']:
            user2 = item['id'] if 'id' in item else 'user2'
            friends.append(user2)

            write_dat('friendship', [user1, user2])

            if 'contact' in item:
                contact = item['contact']
                if 'twitter' in contact:
                    for t_urls in get_4sq(contact['twitter']):
                        for t_url in t_urls:
                            #print t_url['expanded_url']
                            page = get_page(t_url['expanded_url'])
                            if page != None:
                                get_venue(user2, page)
    return friends
Example #2
import os
import time
import urllib
from heapq import heappop, heappush, heapify

def crawl_web(tocrawl, keyword, n=1000):  # Python 2 code (urllib.urlretrieve, time.clock); returns the set of crawled URLs
    crawled = set([])
    num_404 = 0
    total_size = 0
    min_score = -tocrawl[0][0]
    fout = open('result.txt','w+')
    start = time.clock()
    while tocrawl: 
        url = heappop(tocrawl) # changed page to url - clearer name
        filehandle = get_page(url[1])
        if filehandle is None:
            continue
        code = filehandle.code
        if code == 404:
            num_404 += 1
        if code == 401:
            continue
        if filehandle.headers.type != 'text/html':
            continue
        new_url = filehandle.geturl()
        if new_url not in crawled:
            #corpus.add_page(url, new_url, outlinks, tocrawl, count)
            #tocrawl += outlinks
            page = filehandle.read()
            outlinks, count = get_all_link_keyword(page, new_url, keyword)
            if count == 0:
                continue
            for outlink in outlinks:
                is_new_link = True
                for i in range(len(tocrawl)):
                    target = tocrawl[i]
                    if target[1] == outlink:
                        is_new_link = False
                        tocrawl.pop(i)
                        heapify(tocrawl)  # popping from the middle of the list breaks the heap invariant
                        heappush(tocrawl, (target[0] - count, outlink))
                        break
                #if is_new_link and len(tocrawl) < n:
                if is_new_link:
                    if len(tocrawl) > n:
                        if count > min_score:
                            heappush(tocrawl, (-count, outlink))
                    else:
                        heappush(tocrawl, (-count, outlink))
                        if count < min_score:
                            min_score = count
            crawled.add(new_url)
            urllib.urlretrieve(new_url, os.path.join('downloads',str(n)+".html")) 
            n -= 1
            if n < 0:
                break
            size = len(page)
            total_size += size
            fout.write(new_url + ' time:' + str(time.clock()) + ' size:' + str(size) + ' return_code:' + str(code) + ' score:' + str(-url[0]) + ' actually:' + str(count) + '\n')

    fout.write('number_of_files:' + str(len(crawled)) + ' total_size:' + str(total_size) + ' total_time:' + str(time.clock() - start) + ' number_of_404_errors:' + str(num_404))
    fout.close()

    return crawled
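
Both keyword crawlers (Example #2 and Example #18) expect tocrawl to be passed in as a heap of (negative score, url) pairs, so that heappop always yields the most promising URL first, and they save every fetched page into a local downloads/ directory that must already exist. A minimal usage sketch under those assumptions; the seed URL, keyword and page budget below are placeholders:

from heapq import heapify

seed_frontier = [(-1, 'http://example.com/')]  # hypothetical (negated score, url) seed entry
heapify(seed_frontier)
crawled = crawl_web(seed_frontier, 'python', n=100)  # 'python' is a placeholder keyword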
Example #3
def crawl(self, seed):
    links = set(seed)  # seed is expected to be an iterable of start URLs
    crawled = []
    while len(crawled) < self.depth and links:  # self.depth acts as a page-count budget
        url = links.pop()
        if url not in crawled:
            content = get_page(url)
            scraped_links = self.__get_links(content, url)
            links.update(scraped_links)
            crawled.append(url)
    return crawled
Example #4
def crawl_web(seed, max_depth):
    tocrawl = [[seed, 0]]  # each entry is [url, depth]
    crawled = []
    while tocrawl:
        url, depth = tocrawl.pop()
        if url not in crawled and depth <= max_depth:
            union(tocrawl, get_all_links(getpage.get_page(url), depth + 1))
            crawled.append(url)
    return crawled
Example #5
def crawl_web(seed):
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        user_id = tocrawl.pop()
        if user_id not in crawled:
            content = get_page(user_id)
            friends = get_all_friends(content)
            corpus.add_friend(user_id, friends)
            tocrawl.update(friends)
            crawled.append(user_id)
    return crawled
Example #6
def crawl_web(seed):  # returns a WebCorpus (index + graph of outlinks)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            corpus.add_page(url, content, outlinks)
            tocrawl.update(outlinks)
            crawled.append(url)
    corpus.finish_crawl()
    return corpus
Example #7
def crawl_web(seed):  # returns a WebCorpus (index + graph of outlinks)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            corpus.add_page(url, content, outlinks)
            tocrawl.update(outlinks)
            crawled.append(url)
    corpus.finish_crawl()
    return corpus
Example #8
def crawl_web(seed):  # returns a WebCorpus (index + graph of outlinks)
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(wcorpus.index, url, content)
            outlinks = get_all_links(content)
            wcorpus.graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
Example #9
def crawl_web(seed): # returns a WebCorpus (index + graph of outlinks)
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl: 
        url = tocrawl.pop() # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(wcorpus.index, url, content)
            outlinks = get_all_links(content)
            wcorpus.graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
Example #10
def crawl_web(seed):  # returns index, graph of outlinks
    tocrawl = set([seed])
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {}
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(index, url, content)
            outlinks = get_all_links(content)
            graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return index, graph
Example #11
def crawl_web(seed): # returns index, graph of outlinks
    tocrawl = set([seed])
    crawled = []
    graph = {}  # <url>, [list of pages it links to]
    index = {} 
    while tocrawl: 
        url = tocrawl.pop() # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(index, url, content)
            outlinks = get_all_links(content)
            graph[url] = outlinks
            tocrawl.update(outlinks)
            crawled.append(url)
    return index, graph
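
Most of the crawl_web examples call get_all_links(content), and Examples #8-#11 also call add_page_to_index(index, url, content); neither helper is shown. The sketch below is a hypothetical minimal version inferred from those call sites (an href extractor and a word-to-URLs index). The real project helpers may differ: Example #4 passes a depth as a second argument to get_all_links, and Examples #15-#16 pass the whole corpus object to add_page_to_index rather than a bare dict.

def get_all_links(content):
    # Collect href targets from anchor tags in the raw HTML (None-safe).
    links = []
    if not content:
        return links
    pos = 0
    while True:
        start = content.find('<a href="', pos)
        if start == -1:
            return links
        start += len('<a href="')
        end = content.find('"', start)
        if end == -1:
            return links
        links.append(content[start:end])
        pos = end

def add_page_to_index(index, url, content):
    # Map each whitespace-separated token to the list of URLs that contain it.
    for word in content.split():
        urls = index.setdefault(word, [])
        if url not in urls:
            urls.append(url)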
Example #12
def crawl_web(seed):  # returns a WebCorpus (index + graph of outlinks)
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl:
        url = tocrawl.pop()  # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                wcorpus.add_link(url, outlink)
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
Example #13
def crawl_web(seed): # returns a WebCorpus (index + graph of outlinks)
    tocrawl = set([seed])
    crawled = []
    wcorpus = WebCorpus()
    while tocrawl: 
        url = tocrawl.pop() # changed page to url - clearer name
        if url not in crawled:
            content = get_page(url)
            outlinks = get_all_links(content)
            for outlink in outlinks:
                wcorpus.add_link(url, outlink) 
            for word in content.split():
                wcorpus.add_word_occurrence(url, word)
            tocrawl.update(outlinks)
            crawled.append(url)
    return wcorpus
Example #14
def crawlWeb(seed):
	toCrawl = set([seed])		#start with a seed page
	crawled = []			#keep a record of sites crawled to prevent repeat visits
	wcorpus = WebCorpus()
	while toCrawl:
		url = toCrawl.pop()					
		if url not in crawled:				#check whether already crawled
			content = get_page(url)		#read-in all of the page's html text
			outlinks = getAllLinks(content)  #store outlinks in var for building graph
			for outlink in outlinks:
				wcorpus.add_link(url, outlink)
			for word in content.split():
				wcorpus.add_word_occurrence(url, word)
			toCrawl.update(outlinks)		#add outlinks to the toCrawl set if we haven't crawled them already
			crawled.append(url)				#store page that we popped in crawled. 
	return wcorpus	
Example #15
def crawl_web(seed): # returns webcorpus (includes index, graph)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()

    while tocrawl: 
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(corpus, url, content)
            
            outlinks = get_all_links(content)
            for outlink in outlinks:
                corpus.add_link(url, outlink)

            tocrawl.update(outlinks)
            crawled.append(url)
    
    return corpus
Example #16
def crawl_web(seed):  # returns webcorpus (includes index, graph)
    tocrawl = set([seed])
    crawled = []
    corpus = WebCorpus()

    while tocrawl:
        url = tocrawl.pop()
        if url not in crawled:
            content = get_page(url)
            add_page_to_index(corpus, url, content)

            outlinks = get_all_links(content)
            for outlink in outlinks:
                corpus.add_link(url, outlink)

            tocrawl.update(outlinks)
            crawled.append(url)

    return corpus
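
The WebCorpus class used in Examples #5-#16 is also not shown. Below is a hypothetical minimal stand-in consistent with the calls made above (add_page, add_link, add_word_occurrence, add_friend, finish_crawl, plus the index and graph attributes); the real class in the source project almost certainly does more.

class WebCorpus(object):
    def __init__(self):
        self.index = {}  # word -> list of urls containing it
        self.graph = {}  # url (or user id) -> list of outlinks/friends

    def add_link(self, url, outlink):
        self.graph.setdefault(url, []).append(outlink)

    def add_word_occurrence(self, url, word):
        urls = self.index.setdefault(word, [])
        if url not in urls:
            urls.append(url)

    def add_page(self, url, content, outlinks):
        # Used by Examples #6-#7: record outlinks and index the page text in one call.
        self.graph[url] = list(outlinks)
        for word in content.split():
            self.add_word_occurrence(url, word)

    def add_friend(self, user_id, friends):
        # Used by Example #5, where nodes are user ids rather than URLs.
        self.graph.setdefault(user_id, []).extend(friends)

    def finish_crawl(self):
        pass  # post-processing hook (e.g. ranking); a no-op in this sketch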
Example #17
import json
import time

import tweepy  # third-party; needed for tweepy.TweepError

def get_all_friends(user1):
    friends = []
    page = get_4sq_page('users', user1)
    if page is None:  # guard against a failed request, as in Example #1
        return friends
    user = json.loads(page)['response']['user']
    count = 0
    if 'checkins' in user:
        checkins = user['checkins']
        if 'count' in checkins:
            count = checkins['count']
    #print user, count
    firstName = user['firstName'] if 'firstName' in user else 'firstName'
    lastName = user['lastName'] if 'lastName' in user else 'lastName'
    homeCity = user['homeCity'] if 'homeCity' in user else 'homeCity'
    write_dat('user', [user1, firstName, lastName, homeCity.replace(' ','_'), count])
    for group in user['friends']['groups']:
        for item in group['items']:
            user2 = item['id'] if 'id' in item else 'user2'
            friends.append(user2)

            write_dat('friendship', [user1, user2])

            if 'contact' in item:
                contact = item['contact']
                if 'twitter' in contact:
                    try:
                        for t_urls in get_4sq(contact['twitter']):
                            for t_url in t_urls:
                                #print t_url['expanded_url']
                                get_venue(user2, get_page(t_url['expanded_url']))
                    except tweepy.TweepError:
                        time.sleep(60 * 2)
                        continue
                    except StopIteration:
                        break
                    except Exception:  # avoid a bare except that would also swallow KeyboardInterrupt
                        pass
    return friends
Example #18
import os
import time
import urllib
from heapq import heappop, heappush, nsmallest

def crawl_web(tocrawl, keyword, n=1000):  # Python 2 code (urllib.urlretrieve, time.clock); returns the set of crawled URLs
    crawled = set([])
    url_finder = {}
    num_404 = 0
    total_size = 0
    fout = open('result.txt','w+')
    start = time.clock()
    while tocrawl: 
        url = heappop(tocrawl) # changed page to url - clearer name
        filehandle = get_page(url[1])
        if filehandle is None:
            continue
        code = filehandle.code
        if code == 404:
            num_404 += 1
        if code == 401:
            continue
        if filehandle.headers.type != 'text/html':
            continue
        new_url = filehandle.geturl()
        if new_url not in crawled:
            #corpus.add_page(url, new_url, outlinks, tocrawl, count)
            #tocrawl += outlinks
            page = filehandle.read()
            outlinks, count = get_all_link_keyword(page, new_url, keyword)
            if count == 0:
                continue
            len_tocrawl = len(tocrawl)
            for outlink in outlinks:
                is_new_link = True
                min_index = 0
                max_index = len_tocrawl - 1
                if outlink in url_finder:
                    key_score = url_finder[outlink]
                    while True:
                        if max_index < min_index:
                            break
                        m = (max_index + min_index) // 2  # integer midpoint
                        target = tocrawl[m]
                        if target[0] < key_score:
                            min_index = m + 1
                        elif target[0] > key_score:
                            max_index = m - 1
                        else:
                            for j in range(min_index, max_index):
                                target = tocrawl[j]
                                if target[1] == outlink:
                                    is_new_link = False
                                    tocrawl.pop(j)
                                    url_score = target[0] - count
                                    heappush(tocrawl, (url_score, outlink))
                                    url_finder[outlink] = url_score
                                    break
                            break
#                for i in range(len_tocrawl):
#                    target = tocrawl[i]
#                    if target[1] == outlink:
#                        is_new_link = False
#                        tocrawl.pop(i)
#                        heappush(tocrawl, (target[0] - count, outlink))
#                        break
                #if is_new_link and len(tocrawl) < n:
                else:
                    heappush(tocrawl, (-count, outlink))
                    url_finder[outlink] = -count
            tocrawl = nsmallest(n, tocrawl)
            crawled.add(new_url)
            urllib.urlretrieve(new_url, os.path.join('downloads',str(n)+".html")) 
            n -= 1
            if n < 0:
                break
            size = len(page)
            total_size += size
            fout.write(new_url + ' time:' + str(time.clock()) + ' size:' + str(size) + ' return_code:' + str(code) + ' score:' + str(-url[0]) + ' actually:' + str(count) + '\n')

    fout.write('number_of_files:' + str(len(crawled)) + ' total_size:' + str(total_size) + ' total_time:' + str(time.clock() - start) + ' number_of_404_errors:' + str(num_404))
    fout.close()

    return crawled
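
A note on get_page, which nearly every example relies on: most of the snippets treat it as returning the page body as a string (or None on failure), while Examples #2 and #18 treat it as returning an open urllib-style response object with .code, .headers, .geturl() and .read(). A hypothetical sketch of the string-returning variant, in the same Python 2 style as the rest of the code:

import urllib2

def get_page(url):
    # Fetch a URL and return its body as a string, or None on any error.
    try:
        return urllib2.urlopen(url).read()
    except Exception:
        return None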