def run():
    """Fetch per-seller price data for each book row in kaijuannodump.csv.

    For every row (skipping index 0, assumed to be the CSV header --
    TODO confirm) this:
      1. skips ISBNs already recorded as visited (via isAready),
      2. queries BookAPI using a randomly chosen User-Agent string,
      3. appends one column per seller in `sellers` (the matching value
         from the API response, or the literal string "None"),
      4. appends the completed row to newkaijuan.csv and records the
         ISBN in visited.csv.

    NOTE(review): the helpers (ldUserAgents, loadMatrixFromFile,
    isAready, BookAPI, appendstr2file, appendlst2file, lstUtf8, choice,
    sleep) are imported elsewhere in this file; their behaviour is
    inferred from names and call sites -- confirm against their module.
    """
    user_agents = ldUserAgents("./UserAgentString.json")
    bookinfos = loadMatrixFromFile("./kaijuannodump.csv")
    # bookinfos = loadMatrixFromFile(sys.argv[1])
    # newbookinfos = []
    # Seller names are matched as substrings against the keys of the
    # API's "data" dict; list order fixes the output column order.
    sellers = ["亚马逊", "京东", "当当", "北发", "淘书", "博库", "文轩", "中国图书", "China-pub"]
    for index in range(1, len(bookinfos)):
        if isAready(bookinfos[index][0]):
            print "INFO: Already processed"
            continue
        # Trailing comma (Python 2): suppress the newline so later
        # output for this row continues on the same console line.
        print "INFO: processing", bookinfos[index][0],
        b = BookAPI(bookinfos[index][0], choice(user_agents))
        JsonData = json.loads(b.api())
        if JsonData["error"]:
            # Log the failing ISBN; the row is never marked visited,
            # so it will be retried on a later run.
            appendstr2file(JsonData["isbn"], "error.log")
            print "ERROR: got an network error "
            sleep(1)
            continue
        # NOTE(review): this aliases the row inside bookinfos, so the
        # appends below also mutate bookinfos -- harmless here since
        # each row is processed once, but worth knowing.
        tmpLst = bookinfos[index]
        if JsonData["total"] == 0:
            # No price data: pad every seller column with "None".
            # NOTE(review): the padded row is NOT written to
            # newkaijuan.csv and NOT marked visited, so it will be
            # reprocessed next run -- confirm this is intentional.
            for seller in sellers:
                tmpLst.append("None")
            print "WARNING: no data"
            sleep(1)
            continue
        for seller in sellers:
            # Default to "None", then replace it if any key of the
            # API data contains this seller's name as a substring.
            tmpLst.append("None")
            for key in JsonData["data"]:
                # Python 2: the seller literals are byte strings;
                # decode so the containment test compares unicode
                # against the (presumably unicode) JSON keys.
                if seller.decode("UTF-8") in key:
                    del tmpLst[-1]
                    tmpLst.append(JsonData["data"][key])
                    break
        # newbookinfos.append(tmpLst)
        appendlst2file(lstUtf8(tmpLst), "newkaijuan.csv")
        appendstr2file(bookinfos[index][0], "visited.csv")
        print "Done"
        # Throttle: one-second pause between API requests.
        sleep(1)
#!/usr/bin/env python # # Author: Archer Reilly # Date: 15/Sep/2015 # File: dedumplicate.py # Desc: remove the dumplicate records from kaijuan.csv # # Produced By BR(BeautifulReading) from util import loadMatrixFromFile, saveMatrixToFile mat = loadMatrixFromFile('./kaijuan.csv') newmat = [] tmpLst = [] for row in mat: if row[0] in tmpLst: print 'INFO: dumplicate ', row[0] else: newmat.append(row) tmpLst.append(row[0]) saveMatrixToFile('kaijuannodump.csv', newmat)