import json
from datetime import datetime

import pandas as pd

# Project-local dependencies (import paths assumed; adjust to this repo's layout):
# Elastic, sanitize, tweet_parser, Predictor, clean_text,
# remove_field_from_index and DOCUMENT_INDEX.


class Fill:
    def __init__(self):
        self.keywords = self.set_keywords()
        self.es = Elastic()

    def set_keywords(self):
        """Build a per-language set of lowercased flood keywords."""
        df = pd.read_excel('input/twitter_supported_languages.xlsx')
        df = df[df['implemented'] == True].set_index(
            'language_code')['floods_filtered']
        keywords = {}
        # Series.iteritems() was removed in pandas 2.0; items() is the equivalent.
        for language, words in df.items():
            keywords[language] = set(
                word.strip().lower() for word in words.split(','))
        return keywords

    def generate_tweets(self, fp, start=datetime(1970, 1, 1)):
        """Yield tweets from a newline-delimited JSON file that contain at
        least one keyword for their language. Note: `start` is currently
        unused."""
        with open(fp, 'rb') as f:
            for line in f:  # stream the file instead of readlines()
                line = line.strip()
                try:
                    tweet = json.loads(line)
                except json.decoder.JSONDecodeError:
                    continue
                try:
                    language = tweet['lang']
                except KeyError:
                    continue
                clean_text = sanitize.clean_text(tweet['text'], lower=False)
                clean_text_lower = clean_text.lower()
                try:
                    # Substring match: e.g. 'rain' also matches 'raining'.
                    if not any(keyword in clean_text_lower
                               for keyword in self.keywords[language]):
                        continue
                except KeyError:  # no keyword set for this language
                    continue
                yield tweet

    def prepare_doc(self, json_doc):
        doc2es = tweet_parser(json_doc)
        doc2es['_index'] = DOCUMENT_INDEX
        doc2es['_id'] = doc2es['id']
        # '_type' is deprecated since Elasticsearch 7 and removed in 8;
        # drop it when targeting newer clusters.
        doc2es['_type'] = '_doc'
        doc2es['source']['type'] = 'tweet'
        return doc2es

    def prepare_docs(self, docs):
        for doc in docs:
            doc2es = self.prepare_doc(doc)
            if doc2es:
                yield doc2es

    def commit_docs(self, docs):
        self.es.bulk_operation(docs)

    def __call__(self, fp):
        tweets = self.generate_tweets(fp)
        tweets = self.prepare_docs(tweets)
        self.commit_docs(tweets)
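# For reference, each bulk action emitted by Fill.prepare_doc() is shaped
# roughly as below. Field values are illustrative; tweet_parser() supplies
# 'id', 'source' and the rest of the document payload.
#
#     {
#         '_index': DOCUMENT_INDEX,
#         '_id': '1234567890',
#         '_type': '_doc',
#         'id': '1234567890',
#         'source': {'type': 'tweet', ...},
#     }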
def classify(refresh=False):
    # 'refresh' is assumed to be a keyword argument (it is otherwise
    # undefined); True clears existing labels and reclassifies everything.
    es = Elastic()
    classify_per = 10_000
    if refresh:
        remove_field_from_index(DOCUMENT_INDEX, 'event_related')
    predictor = Predictor()
    # Select documents that have locations but no classification yet.
    query = {
        'query': {
            'bool': {
                'must': [
                    {'exists': {'field': 'locations'}}
                ],
                'must_not': {'exists': {'field': 'event_related'}}
            }
        }
    }
    n = es.n_hits(index=DOCUMENT_INDEX, body=query)
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)

    def classify_batch(tweet_subset):
        """Run the predictor on one batch and write the labels back to ES."""
        IDs = []
        examples = []
        for tweet in tweet_subset:
            tweet = tweet['_source']
            IDs.append(tweet['id'])
            examples.append({
                'id': tweet['id'],
                'sentence1': clean_text(tweet['text'], lower=False),
                'label': 0  # placeholder; the predictor supplies the label
            })
        labels = predictor(examples)
        es_update = []
        for ID, label in zip(IDs, labels):
            es_update.append({
                'doc': {'event_related': label == 'yes'},
                '_index': DOCUMENT_INDEX,
                '_id': ID,
                '_op_type': 'update',
            })
        es.bulk_operation(es_update)

    tweet_subset = []
    for i, tweet in enumerate(tweets):
        if not i % classify_per:
            print(f"{i}/{n} ({int(i / n * 100)}%) - {datetime.now()}")
        tweet_subset.append(tweet)
        if len(tweet_subset) == classify_per:
            classify_batch(tweet_subset)
            tweet_subset = []
    # Flush the final partial batch so trailing tweets are classified too.
    if tweet_subset:
        classify_batch(tweet_subset)
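if __name__ == '__main__':
    # Minimal driver sketch: ingest a newline-delimited JSON dump of tweets,
    # then classify the indexed documents. The file path is hypothetical and
    # this entry point is an assumption, not part of the original pipeline.
    fill = Fill()
    fill('input/tweets.jsonl')
    classify(refresh=False)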