import gzip
import json
import random

# LanguageChecker and HashtagExtractor are project helpers defined elsewhere
# in the repository (a hypothetical sketch of their interface follows this
# script).


def main(filename, outfilename, count, stop_after):
    sample = []
    checker = LanguageChecker('italian')
    hashtag = HashtagExtractor()

    print "Extracting a sample of %d tweets. Stopping after %d tweets" % (
        count, stop_after)

    with gzip.open(filename, 'r') as input:
        for idx, line in enumerate(input):
            jobj = json.loads(line)

            # Check that the tweet is actually Italian
            if not checker.is_valid(jobj['text']):
                continue

            # Skip tweets that carry no hashtags
            if not hashtag.extract(jobj):
                continue

            # Reservoir-style sampling: fill the buffer first, then replace
            # random slots keyed on the current line index
            if len(sample) < count:
                sample.append(jobj)
            else:
                r = random.randint(0, idx)
                if r < count:
                    sample[r] = jobj

            if idx >= stop_after and len(sample) >= count:
                break

    with gzip.open(outfilename, 'w') as output:
        for jobj in sample:
            output.write(json.dumps(jobj) + "\n")
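# The sampling script above only needs LanguageChecker.is_valid(),
# HashtagExtractor.extract() and HashtagExtractor.sanitize(). The classes
# below are a minimal, hypothetical sketch of that interface: the
# stopword-ratio heuristic, the threshold value and the internals are
# assumptions for illustration, not the project's actual implementation.
import re

from nltk.corpus import stopwords


class LanguageChecker(object):
    """Naive language filter: accept text with enough stopwords (sketch)."""

    def __init__(self, language, threshold=0.15):
        self.words = set(stopwords.words(language))
        self.threshold = threshold

    def is_valid(self, text):
        tokens = re.findall(r"\w+", text.lower(), re.UNICODE)
        if not tokens:
            return False
        hits = sum(1 for token in tokens if token in self.words)
        return float(hits) / len(tokens) >= self.threshold


class HashtagExtractor(object):
    """Pull hashtags out of a tweet and strip them from its text (sketch)."""

    def extract(self, tweet):
        entities = tweet.get('entities', {}).get('hashtags', [])
        return [ht['text'].lower() for ht in entities]

    def sanitize(self, text):
        return re.sub(r"#\w+", "", text).strip()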
import gzip
import json

import requests

# SAMPLE_URL (the streaming sample endpoint) and LanguageChecker are defined
# elsewhere in the module; prefetch=False is the pre-1.0 requests spelling of
# what later became stream=True.


class StreamMonitor(object):
    def __init__(self, username, password, output):
        self.auth = (username, password)
        self.output = output
        self.checker = LanguageChecker('italian')

    def run(self):
        r = requests.get(SAMPLE_URL, auth=self.auth, prefetch=False)

        with gzip.open(self.output, 'a') as output:
            for line in r.iter_lines():
                # Keep-alive newlines from the streaming API arrive empty
                if not line:
                    continue

                tweet = json.loads(line)

                # Keep only Italian tweets carrying at least two hashtags
                if 'text' in tweet and \
                   tweet['user']['lang'] == 'it' and \
                   self.checker.is_valid(tweet['text']) and \
                   len(tweet['entities']['hashtags']) > 1:
                    output.write(line + "\n")
                    print tweet['text']
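# A minimal way to run the monitor, assuming credentials and the output path
# are passed on the command line. This driver is an illustrative assumption,
# not part of the original module.
if __name__ == '__main__':
    import sys

    if len(sys.argv) != 4:
        sys.exit("Usage: monitor.py <username> <password> <output.json.gz>")

    monitor = StreamMonitor(sys.argv[1], sys.argv[2], sys.argv[3])
    monitor.run()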
import gzip
import json
import sys

# LanguageChecker, HashtagExtractor and AnnotationExtractor are project
# helpers defined elsewhere in the repository.


class Annotator(object):
    def __init__(self, input_file, output_file, rho_log, ht_log):
        self.lang = LanguageChecker('italian')
        self.hashtag = HashtagExtractor()
        self.annotator = AnnotationExtractor()

        # Counters for the progress line written to stderr
        self.italian = 0
        self.annotated = 0
        self.total = 0
        self.requests = 0
        self.coht = 0
        self.rho_warn = 0

        self.rho_log = gzip.open(rho_log, 'w')
        self.ht_log = gzip.open(ht_log, 'w')

        self.input_file = input_file
        self.output_file = output_file

    def run(self, skip_lang=False):
        with gzip.open(self.output_file, 'w') as output:
            with gzip.open(self.input_file, 'r') as f:
                for line in f:
                    jobj = json.loads(line)

                    unstripped = jobj['text']
                    tweet_id = jobj['id_str']
                    text = self.hashtag.sanitize(unstripped)

                    self.total += 1

                    # Skip non-Italian tweets
                    if not skip_lang and not self.lang.is_valid(text):
                        continue

                    self.italian += 1
                    hts = self.hashtag.extract(jobj)

                    # Skip tweets without hashtags
                    if not hts:
                        continue

                    buff = self.annotate(tweet_id, unstripped, text, hts)

                    if buff:
                        output.write(buff)

                    if self.annotated % 1000 == 0:
                        sys.stderr.write(
                            "%d annotated of %d requested of %d italians of "
                            "%d processed [%d warning, %d co-ht]\r" %
                            (self.annotated, self.requests, self.italian,
                             self.total, self.rho_warn, self.coht))
                        sys.stderr.flush()

        sys.stderr.write(
            "%d annotated of %d requested of %d italians of "
            "%d processed [%d warning, %d co-ht]\n" %
            (self.annotated, self.requests, self.italian,
             self.total, self.rho_warn, self.coht))
        sys.stderr.flush()

    def annotate(self, tweet_id, unstripped, text, hts):
        self.requests += 1
        annotations = self.annotator.annotate(text)

        if not annotations:
            return ""

        payload = {
            "hts": hts,
            "annotations": annotations,
            "id": tweet_id,
            "tweet": text
        }

        self.annotated += 1
        buff = json.dumps(payload) + '\n'

        # Log payloads containing an annotation whose rho score sits exactly
        # at the 0.5 borderline
        for annotation in annotations:
            if annotation[1] == 0.5:
                self.rho_log.write(buff)
                self.rho_warn += 1
                break

        # Log hashtag co-occurrences (tweets carrying two or more hashtags)
        if len(hts) >= 2:
            self.ht_log.write(json.dumps(hts) + '\n')
            self.coht += 1

        return buff
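# A possible driver for the annotator, again assumed rather than taken from
# the original module: it wires the four gzip paths into the constructor and
# forwards an optional flag to skip the language check.
if __name__ == '__main__':
    import optparse

    parser = optparse.OptionParser(
        usage="%prog [options] input.json.gz output.json.gz rho.log.gz ht.log.gz")
    parser.add_option('-s', '--skip-lang', dest='skip_lang',
                      action='store_true', default=False,
                      help="skip the Italian language check")
    opts, args = parser.parse_args()

    if len(args) != 4:
        parser.error("four gzip file paths are required")

    annotator = Annotator(args[0], args[1], args[2], args[3])
    annotator.run(skip_lang=opts.skip_lang)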