Exemplo n.º 1
0
def getMicroblog(idd, pages, opener, MIDs, batch=20):
    """Fetch and store the microblogs of user ``idd``, page by page.

    For every page from 1 to ``pages`` the page is requested three times
    through ``GetPage.tryGetPage`` with a third argument of 0, 1 and 2
    (presumably the pagebar index -- confirm against GetPage). A request that
    raises one of the known GetPage failures is reported and skipped. Each
    response is split on the feed-item marker and every piece is handed to
    ``store``. After a response has been processed the function sleeps for
    ``randomTimes * random.random() + sleepTime`` seconds; both names are
    module-level settings.

    Crawling stops early through ``TimeLimitException`` when a response
    consists of a single piece accepted by ``isLastOne``, or when ``store``
    returns a falsy value.

    Args:
        idd: User identifier, forwarded to ``GetPage.tryGetPage`` and ``store``.
        pages: Number of pages to walk (pages 1..pages inclusive).
        opener: Forwarded unchanged to ``GetPage.tryGetPage``.
        MIDs: Forwarded unchanged to ``store``.
        batch: Unused; kept so existing callers keep working.

    Returns:
        ``False`` when, on the first page, the number of pieces per day
        exceeds 0.5; ``True`` otherwise, including when crawling stopped
        early or an unexpected error aborted it.
    """
    result = True
    try:
        for page in range(1, pages + 1):
            micros = 0
            TEnd = time.time()
            # One-element list so that store() can write into it.
            TBegin = [0.0]
            for pagebar in range(3):
                try:
                    text = GetPage.tryGetPage(idd, page, pagebar, opener)
                except (GetPage.FailureOverTimesException,
                        GetPage.LoadingFailingException,
                        GetPage.NetworkBusyException) as e:
                    # Known fetch failures: report and move on to the next part.
                    print(e)
                    continue

                microblogs = text.split("div action-type=\"feed_list_item\" ")
                micros += len(microblogs)
                if len(microblogs) == 1 and isLastOne(microblogs[0]):
                    raise TimeLimitException
                for microblog in microblogs:
                    # Per the original note, store() judges the microblog's
                    # creation time; a falsy return means "stop crawling".
                    if not store(idd, microblog, MIDs, TBegin):
                        raise TimeLimitException
                time.sleep(randomTimes * random.random() + sleepTime)

            if page == 1:
                # Presumably store() left a millisecond timestamp in TBegin[0];
                # convert it to seconds -- TODO confirm against store().
                TBegin[0] /= 1000
                # The 0.1 offset avoids a zero divisor when both timestamps
                # coincide.
                days = (TEnd - TBegin[0]) / (3600 * 24) + 0.1
                # More than 0.5 pieces per day on the first page: report False
                # so that (per the original note) comments are not crawled.
                # NOTE(review): the original comment quoted 1.5/day while the
                # code compares against 0.5; the code's threshold is kept.
                if micros / days > 0.5:
                    result = False
                    print("post too frequently")
                    break
    except TimeLimitException:
        # Expected stop signal raised above; not an error.
        pass
    except Exception as e:
        # The original swallowed every exception via `return` inside
        # `finally`. Keep the best-effort contract for ordinary errors, but
        # make them visible; KeyboardInterrupt/SystemExit now propagate.
        print(e)
    finally:
        print("end grab microblog", idd)
    return result
Exemplo n.º 2
0
def getPageNumber(idd, opener):
    """Return the number parsed from the first page of user ``idd``.

    The name suggests this is the total page count; that is not verifiable
    from this function alone.

    Returns:
        0 when ``GetPage.tryGetPage`` raises ``GetPage.NoMicroblogException``
        or when no number is found; 1 when the fetched text does not contain
        the "微博列表" marker; otherwise the first integer in the text that
        has a single space on each side.
    """
    try:
        # NOTE(review): the third argument is the pagebar according to the
        # original note, which also questions why it is 3 here.
        first_page = GetPage.tryGetPage(idd, 1, 3, opener)
    except GetPage.NoMicroblogException:
        return 0

    if "微博列表" not in first_page:
        return 1

    # NOTE: the original author flagged this pattern as needing repair; it
    # simply picks the first space-delimited run of digits.
    found = re.search(r' (\d+) ', first_page)
    return int(found.group(1)) if found is not None else 0