def parseLogPage(html):
    qs = QuoraScraper()
    print("Parsing html")
    parsed = pq(html)
    entries = []
    for i in parsed.find('.QuestionLog')[0].getchildren()[:-1]:
        top, bottom = i.getchildren()[0].getchildren()
        if 'AddQuestionRedirectOperationView' in top.attrib['class']:
            # Merge/redirect entries embed the question text and target user differently
            text = dehtml(''.join(map(tostring, top.getchildren()[0].getchildren()[:2])))
            user = top.getchildren()[0].getchildren()[3].attrib['href']
            actionType = "MERGED"
        else:
            text = None
            if len(top) > 1:
                text = dehtml(tostring(top[1]))
            actionType = top.getchildren()[0].text
            userElm = top.getchildren()[0].getchildren()
            if len(userElm) > 1:
                user = userElm[1].attrib['href']
            else:
                user = None
        # Bottom line reads "<revision> • <date>"
        revision, date = bottom.text_content().split(u' \xe2\x80\xa2 ')
        entry = {
            "date": qs.processDate(date),
            "revision": revision,
            "user": user,
            "actionType": actionType,
            "text": text
        }
        entries.append(entry)
    return entries
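# Usage sketch (illustrative, not part of the original script): it assumes
# QuoraScraper.processLog returns a dict with an 'html' field holding the raw
# revision-log page, as the rescrape script in this repo suggests. The URL and
# the helper name are placeholders.
def exampleParseLog(url):
    qs = QuoraScraper()
    ret = qs.processLog(url)
    entries = parseLogPage(ret['html'])
    for e in entries:
        print("{} {} by {}".format(e['date'], e['actionType'], e['user']))
    return entries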
def reparse(fn):
    '''Reparses question from stored HTML'''
    with open(fn) as f:
        entry = json.load(f)
    html_data = GzipFile(fileobj=StringIO(a2b_hex(entry['html']))).read()
    reparsed = QuoraScraper.getQuestion(html_data)
    entry['data'] = reparsed
    with open(fn, 'w') as f:
        json.dump(entry, f)
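# Usage sketch (illustrative): re-run reparse over every stored JSON file in an
# output directory. The directory name and the .json suffix are assumptions
# (the scraper's default output directory is "data", but the file layout may
# differ), and it assumes os is already imported in this module.
def reparseAll(directory='data'):
    for name in os.listdir(directory):
        if name.endswith('.json'):
            reparse(os.path.join(directory, name))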
# Empty string must be used here
# localhost/hostname cause clients to be unable to connect
HOST = ''
PORT = args.PORT

logging.info("Starting server on {}:{}".format(HOST, PORT))
server = socketserver.TCPServer((HOST, PORT), ScrapeServer)

### ADD GLOBAL DATA TO SERVER INSTANCE ###
# Seen links
server.directory = {}
# Queue
server.urls_to_use = set()
# Seeder
server.urlGen = QuoraScraper.getQuestionPage()

# Load previously scraped links
if os.path.isfile(DIRECTORY_FILE):
    with open(DIRECTORY_FILE) as f:
        data = f.read().strip().split('\n')
    server.directory = {}
    for entry in data:
        entry = json.loads(entry)
        # Each line is a single-key JSON object mapping a URL to its record
        key = next(iter(entry))
        server.directory[key] = entry[key]
else:
    # Create an empty directory file so later appends succeed
    f = open(DIRECTORY_FILE, 'w')
    f.close()

# Load previously saved seed links
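# Sketch of the directory-file format implied by the loading loop above: one
# JSON object per line, each with a single key mapping a URL to its record.
# How the server actually appends entries is not shown here, so this helper is
# only an assumption that mirrors what the loader expects.
def appendDirectoryEntry(url, record):
    with open(DIRECTORY_FILE, 'a') as f:
        f.write(json.dumps({url: record}) + '\n')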
import binascii
import gzip
import os
import urllib2
import selenium.common
from StringIO import StringIO

from Quora.QuoraScraper import QuoraScraper

basePath = '/export/a04/wpovell/'
if 'SCRAPE_LOG_BASEPATH' in os.environ:
    basePath = os.environ['SCRAPE_LOG_BASEPATH']

# URLs that have been scraped at least once
with open(os.path.join(basePath, 'scrapedUrls.txt')) as f:
    inp = set(f.read().strip().split('\n'))
# URLs whose logs have already been rescraped
with open(os.path.join(basePath, 'rescrapedUrls.txt')) as f:
    done = set(f.read().strip().split('\n'))
doneF = open(os.path.join(basePath, 'rescrapedUrls.txt'), 'a')

qs = QuoraScraper()
for line in list(inp - done):
    try:
        ret = qs.processLog(line)
    except Exception:
        print("BAD: {}".format(line))
        continue

    # Gzip the fetched HTML in memory before storing it
    out = StringIO()
    try:
        with gzip.GzipFile(fileobj=out, mode="w") as f:
            f.write(ret['html'].encode('utf-8'))
    except TypeError:
        print("BAD: {}".format(line))
        continue
    compressed_html = out.getvalue()
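# Round-trip sketch of the storage encoding these scripts use: gzip-compress the
# HTML into an in-memory buffer, then hex-encode it so it can sit inside a JSON
# field; reading reverses both steps (see reparse above). The helper names are
# illustrative only.
def packHtml(html):
    buf = StringIO()
    with gzip.GzipFile(fileobj=buf, mode='w') as gz:
        gz.write(html.encode('utf-8'))
    return binascii.b2a_hex(buf.getvalue())

def unpackHtml(hexData):
    return gzip.GzipFile(fileobj=StringIO(binascii.a2b_hex(hexData))).read()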
                    type=int, default=7,
                    help="how long to wait between requests")
args = parser.parse_args()
HOST = args.HOST
PORT = args.PORT

# Directory to write output files to
if args.output is None:
    OUTPUT_DIRECTORY = "data"
else:
    OUTPUT_DIRECTORY = args.output[0]

scraper = QuoraScraper(wait=args.wait)

logging.info("Connecting to {} on port {}".format(HOST, PORT))
try:
    # Send empty request to get first job
    url = getUrl(EMPTY_REQUEST)
    while True:
        # Logging time
        ts = datetime.now().strftime('%Y/%m/%d %H:%M:%S')
        logging.info("[{}] URL = {}".format(ts, url))
        # Output file time
        t = int(time())

        # Scrape and process data
        html, log, data = None, None, None
        try:
if args.m:
    toGetTimes = open(args.m, 'w')

files = getFiles(INPUT_DIR)
files = binFiles(files)
for fileHash, fileList in files.items():
    print(fileHash)
    # Shard output by the leading characters of the hash: OUTPUT_DIR/h0/h1/h2.tar.gz
    outPath = '{}/{}/{}'.format(OUTPUT_DIR, fileHash[0], fileHash[1])
    if not os.path.isdir(outPath):
        os.makedirs(outPath)
    tarf = tarfile.open('{}/{}.tar.gz'.format(outPath, fileHash[2]), 'w:gz')
    for fn, fullHash in fileList:
        with open(fn) as f:
            data = json.load(f)
        if 'data' not in data:
            continue
        t = data['time']
        # Stored HTML is hex-encoded gzip data; decode and decompress it
        html = data['html']
        html = a2b_hex(html)
        strFile = StringIO(html)
        html = GzipFile(fileobj=strFile).read()
        info = QuoraScraper.getQuestion(html, t)
        data['data'] = info
        if args.m and 'log' not in data:
            toGetTimes.write(fn + '\n')
        createEntry(fullHash, data, tarf)
    tarf.close()
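# Sketch (illustrative): inspect one of the generated shard archives. What each
# member contains depends on createEntry, which is not shown here, so this only
# lists member names and sizes.
def listShard(path):
    with tarfile.open(path, 'r:gz') as tarf:
        for member in tarf.getmembers():
            print("{} {}".format(member.name, member.size))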
import json
import urllib2

from Quora.QuoraScraper import QuoraScraper

if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser(description='Complete previously scraped data.')
    parser.add_argument('EMAIL', help='Quora account email.')
    parser.add_argument('PASS', help='Quora account password.')
    parser.add_argument('-r', '--read', default="fileInfo.txt", help='File to read data info from.')
    parser.add_argument('-o', '--output', default="completeData.txt", help="File to write results to.")
    parser.add_argument('-s', '--seen', default="seenData.txt", help="File to write seen files to.")
    args = parser.parse_args()

    # One scraper without login, one logged in with the given credentials
    s_nl = QuoraScraper()
    s = QuoraScraper(True, args.EMAIL, args.PASS)

    f = open(args.read)
    o = open(args.output, 'a')
    with open(args.seen) as seenF:
        data = set(seenF.read().split('\n'))
    seen = open(args.seen, 'a')

    c = 0
    try:
        for line in f.readlines():
            fn, url, hasData, hasTime = json.loads(line)
            if fn in data or not hasData:
                continue
            if not hasTime: