def getInRange(inputfile):
    """Crawl Dianping data for every user ID listed in ``inputfile``.

    ``inputfile`` is a text file holding one integer user ID per line; blank
    lines are ignored.  For each ID a fresh Firefox driver is started, the
    profile is fetched and, unless it contains ``"Year": 1990``, written to
    ``./Data/<id>_profile.txt``.  Follows, fans, check-ins and reviews are
    crawled (into ``./Data/<id>_<kind>.txt``) only when the profile reports
    a positive ``Checkin`` count and a positive ``Review`` count.

    Crawling is best effort: an error while handling one ID is reported on
    stdout and the loop moves on to the next ID.  The driver is closed after
    every ID, whether it succeeded, was skipped or failed.

    Raises:
        ValueError: if ``inputfile`` contains no IDs, or a non-blank line is
            not an integer.
        IOError: if ``inputfile`` cannot be opened.
    """
    # ``with`` guarantees the handle is released; blank lines are skipped
    # instead of silently ending the read at the first one.
    with open(inputfile) as id_file:
        id_pool = [int(line) for line in id_file if line.strip()]
    if not id_pool:
        print('Error! Inputfile is empty!')
        raise ValueError('Error! Inputfile is empty!')

    def pause():
        # Random 3-4 second delay between page requests.
        time.sleep(random.randint(3, 4))

    def save(kind, user_id, text):
        # The text is fully crawled before the file is opened, so a failed
        # crawl no longer truncates an existing file or leaks a handle.
        with codecs.open("./Data/%s_%s.txt" % (user_id, kind), 'w',
                         'utf-8') as out:
            out.write(text + "\n")

    for user_id in id_pool:
        print(user_id)
        driver = webdriver.Firefox()
        try:
            pause()
            profile_str = dianping_u_profile_crawler.get_page(
                user_id, driver).getstr()
            # NOTE(review): 1990 looks like a placeholder birth year marking
            # profiles not worth crawling - confirm against the crawler.
            if '"Year": 1990' in profile_str:
                continue
            save('profile', user_id, profile_str)

            # Threshold: parse the profile we already hold in memory rather
            # than re-reading the file that was just written.
            profile = json.loads(profile_str)
            if profile['Checkin'] <= 0 or profile['Review'] <= 0:
                continue

            print("... processing an active user ...")
            pause()
            save('follows', user_id,
                 dianping_u_follows_crawler.get_follows(
                     user_id, driver).getstr())
            pause()
            save('fans', user_id,
                 dianping_u_fans_crawler.get_fans(user_id, driver).getstr())
            pause()
            save('checkins', user_id,
                 dianping_u_checkins_crawler.get_checkins(user_id, driver))
            pause()
            save('reviews', user_id,
                 dianping_u_reviews_crawler.get_reviews(user_id, driver))
        except Exception as exc:
            # Best effort: report and move on.  ``Exception`` (not a bare
            # ``except``) lets Ctrl-C / SystemExit stop the crawl.
            print('... skipping %s: %r' % (user_id, exc))
        finally:
            driver.close()
def getInRange(first, last, step):
    """Crawl Dianping data for the user IDs ``first, first+step, ..., last``.

    One Firefox driver is shared by the whole run.  For each ID the profile
    is fetched, the ID is recorded in ``./status.txt`` and the profile is
    written to ``./Data/<id>_profile.txt``.  Follows, fans, check-ins and
    reviews are crawled (into ``./Data/<id>_<kind>.txt``) only when the
    profile reports a positive ``Checkin`` count and a positive ``Review``
    count.

    The first error aborts the run: the driver is closed and the exception
    propagates to the caller.  The driver is also closed on normal
    completion.

    Args:
        first: first user ID to crawl.
        last: last user ID to crawl (inclusive).
        step: increment between consecutive IDs.
    """
    try:
        driver = webdriver.Firefox()
    except Exception:
        # Without a driver nothing below can work; re-raise the real cause
        # instead of falling through to a NameError on ``driver``.
        print('driver failed')
        raise

    def pause():
        # Random 3-4 second delay between page requests.
        time.sleep(random.randint(3, 4))

    def save(kind, user_id, text):
        # The text is fully crawled before the file is opened, so a failed
        # crawl neither truncates an existing file nor leaks a handle.
        with codecs.open("./Data/%s_%s.txt" % (user_id, kind), 'w',
                         'utf-8') as out:
            out.write(text + "\n")

    # A single try/finally closes the driver exactly once on every exit
    # path (normal completion or the first exception).
    try:
        for user_id in range(first, last + 1, step):
            print(user_id)
            pause()
            profile_str = dianping_u_profile_crawler.get_page(
                user_id, driver).getstr()
            # Record the ID as soon as its profile page has been fetched.
            with open("./status.txt", 'w') as status:
                status.write(str(user_id) + "\n")
            save('profile', user_id, profile_str)

            # Threshold: parse the profile we already hold in memory rather
            # than re-reading the file that was just written.
            profile = json.loads(profile_str)
            if profile['Checkin'] <= 0 or profile['Review'] <= 0:
                continue

            print("... processing an active user ...")
            pause()
            save('follows', user_id,
                 dianping_u_follows_crawler.get_follows(
                     user_id, driver).getstr())
            pause()
            save('fans', user_id,
                 dianping_u_fans_crawler.get_fans(user_id, driver).getstr())
            pause()
            save('checkins', user_id,
                 dianping_u_checkins_crawler.get_checkins(user_id, driver))
            pause()
            save('reviews', user_id,
                 dianping_u_reviews_crawler.get_reviews(user_id, driver))
    finally:
        driver.close()
def getInRange(first, last, step):
    """Crawl Dianping data for the user IDs ``first, first+step, ..., last``.

    One Firefox driver is shared by the whole run.  For each ID the profile
    is fetched; if it contains ``"Year": 1990`` the ID is only recorded in
    ``./status.txt`` and skipped.  Otherwise profile, follows, fans,
    check-ins and reviews are written to ``./Data/<id>_<kind>.txt`` and the
    ID is then recorded in ``./status.txt``.

    The first error aborts the run: the driver is closed and the exception
    propagates to the caller.  The driver is also closed on normal
    completion.

    Args:
        first: first user ID to crawl.
        last: last user ID to crawl (inclusive).
        step: increment between consecutive IDs.
    """
    driver = webdriver.Firefox()

    def pause():
        # Random 3-4 second delay between page requests.
        time.sleep(random.randint(3, 4))

    def save(kind, user_id, text):
        # The text is fully crawled before the file is opened, so a failed
        # crawl neither truncates an existing file nor leaks a handle.
        with codecs.open("./Data/%s_%s.txt" % (user_id, kind), 'w',
                         'utf-8') as out:
            out.write(text + "\n")

    def record_status(text):
        # ./status.txt always holds the ID handled most recently.
        with open("./status.txt", 'w') as status:
            status.write(text)

    # A single try/finally closes the driver exactly once on every exit
    # path (normal completion or the first exception).
    try:
        for user_id in range(first, last + 1, step):
            print(user_id)
            pause()
            profile_str = dianping_u_profile_crawler.get_page(
                user_id, driver).getstr()
            # NOTE(review): 1990 looks like a placeholder birth year marking
            # profiles not worth crawling - confirm against the crawler.
            if '"Year": 1990' in profile_str:
                record_status(str(user_id) + "\n")
                continue
            save('profile', user_id, profile_str)
            pause()
            save('follows', user_id,
                 dianping_u_follows_crawler.get_follows(
                     user_id, driver).getstr())
            pause()
            save('fans', user_id,
                 dianping_u_fans_crawler.get_fans(user_id, driver).getstr())
            pause()
            save('checkins', user_id,
                 dianping_u_checkins_crawler.get_checkins(user_id, driver))
            pause()
            save('reviews', user_id,
                 dianping_u_reviews_crawler.get_reviews(user_id, driver))
            # Written without a trailing newline, unlike the skip branch
            # above; kept as-is so existing status readers see no change.
            record_status(str(user_id))
    finally:
        driver.close()
# Example no. 4
# 0
def getInRange(inputfile, current_id,
               firefox_binary_path='/opt/firefox46/firefox',
               geckodriver_path='/root/geckodriver/geckodriver'):
    """Resume crawling Dianping data for the user IDs listed in ``inputfile``.

    ``inputfile`` is a text file holding one integer user ID per line; blank
    lines are ignored.  IDs are walked in file order; everything before
    ``current_id`` is skipped, and crawling starts at ``current_id`` itself.
    If ``current_id`` never occurs in the file, nothing is crawled.

    For each crawled ID the ID is first written to ``./status.txt``, then the
    profile is fetched and written to ``./Data/<id>_profile.txt``.  Follows,
    fans, check-ins and reviews are crawled (into ``./Data/<id>_<kind>.txt``)
    only when the profile reports at least 5 check-ins and at least 5
    reviews.  An error while handling one ID is printed, appended to
    ``./error_log.txt`` and the loop moves on to the next ID.

    Args:
        inputfile: path of the ID list.
        current_id: ID at which crawling (re)starts.
        firefox_binary_path: Firefox executable handed to ``FirefoxBinary``.
        geckodriver_path: geckodriver executable handed to the driver.

    Raises:
        ValueError: if ``inputfile`` contains no IDs, or a non-blank line is
            not an integer.
        IOError: if ``inputfile`` cannot be opened.
    """
    # ``with`` guarantees the handle is released; blank lines are skipped
    # instead of silently ending the read at the first one.
    with open(inputfile) as id_file:
        id_pool = [int(line) for line in id_file if line.strip()]
    if not id_pool:
        print('Error! Inputfile is empty!')
        raise ValueError('Error! Inputfile is empty!')

    def pause():
        # Random 3-4 second delay between page requests.
        time.sleep(random.randint(3, 4))

    def save(kind, user_id, text):
        # The text is fully crawled before the file is opened, so a failed
        # crawl neither truncates an existing file nor leaks a handle.
        with codecs.open("./Data/%s_%s.txt" % (user_id, kind), 'w',
                         'utf-8') as out:
            out.write(text + "\n")

    id_total = len(id_pool)
    binary = FirefoxBinary(firefox_binary_path)
    # NOTE(review): the driver is never closed in the code visible here -
    # confirm whether teardown is expected to happen elsewhere.
    driver = webdriver.Firefox(executable_path=geckodriver_path,
                               firefox_binary=binary)
    resumed = False
    for id_count, user_id in enumerate(id_pool, 1):
        print('ID:' + str(id_count) + '/' + str(id_total) + '    ' +
              str(user_id))
        if not resumed:
            # Fast-forward until the resume point has been reached.
            if user_id != current_id:
                continue
            resumed = True
        # Record the ID *before* crawling so that a crash resumes here.
        with open('./status.txt', 'w') as status:
            status.write(str(user_id))
        try:
            pause()
            profile_str = dianping_u_profile_crawler.get_page(
                user_id, driver).getstr()
            save('profile', user_id, profile_str)

            # Threshold: parse the profile we already hold in memory rather
            # than re-reading the file that was just written.
            profile = json.loads(profile_str)
            if profile['Checkin'] < 5 or profile['Review'] < 5:
                continue

            print("... processing an active user ...")
            pause()
            save('follows', user_id,
                 dianping_u_follows_crawler.get_follows(
                     user_id, driver, profile['Follows']).getstr())
            pause()
            save('fans', user_id,
                 dianping_u_fans_crawler.get_fans(user_id, driver).getstr())
            pause()
            save('checkins', user_id,
                 dianping_u_checkins_crawler.get_checkins(
                     user_id, driver, profile['Checkin']))
            pause()
            save('reviews', user_id,
                 dianping_u_reviews_crawler.get_reviews(user_id, driver))

        except Exception as e:
            # Best effort: report, log and carry on with the next ID.
            print(e)
            with open('./error_log.txt', 'a') as error_log:
                error_log.write(str(time.ctime()) + '\n')
                error_log.write(str(user_id) + ' ' + 'main  ' + '\n')
                error_log.write(str(e))
                error_log.write('\n\n')