Example #1
import json
from datetime import datetime

import pandas as pd

# Project-specific dependencies assumed available in this codebase:
# Elastic (Elasticsearch client wrapper), sanitize (text-cleaning helpers),
# tweet_parser, and the DOCUMENT_INDEX constant.


class Fill:
    def __init__(self):
        self.keywords = self.set_keywords()
        self.es = Elastic()

    def set_keywords(self):
        # Build a per-language set of flood keywords from the supported-languages sheet.
        df = pd.read_excel('input/twitter_supported_languages.xlsx')
        df = df[df['implemented'] == True].set_index(
            'language_code')['floods_filtered']
        keywords = {}
        for language, words in df.items():  # Series.iteritems() was removed in pandas 2.0
            keywords[language] = {
                word.strip().lower() for word in words.split(',')}
        return keywords

    def generate_tweets(self, fp, start=datetime(1970, 1, 1)):
        # Stream tweets from a JSON-lines dump, skipping lines that are not valid
        # JSON, have no language tag, or contain no flood keyword for that language.
        # The `start` parameter is accepted but not used in this snippet.
        with open(fp, 'rb') as f:
            for tweet in f:  # iterate lazily instead of loading the whole file
                tweet = tweet.strip()
                try:
                    tweet = json.loads(tweet)
                except json.decoder.JSONDecodeError:
                    continue
                try:
                    language = tweet['lang']
                except KeyError:
                    continue
                clean_text = sanitize.clean_text(tweet['text'], lower=False)
                clean_text_lower = clean_text.lower()
                try:
                    if not any(keyword in clean_text_lower
                               for keyword in self.keywords[language]):
                        continue
                except KeyError:
                    continue
                yield tweet

    def prepare_doc(self, json_doc):
        # Convert a raw tweet into an Elasticsearch document and attach the
        # bulk-indexing metadata (_index, _id, _type).
        doc2es = tweet_parser(json_doc)
        doc2es['_index'] = DOCUMENT_INDEX
        doc2es['_id'] = doc2es['id']
        doc2es['_type'] = '_doc'
        doc2es['source']['type'] = 'tweet'
        return doc2es

    def prepare_docs(self, docs):
        for doc in docs:
            doc2es = self.prepare_doc(doc)
            if doc2es:
                yield doc2es

    def commit_docs(self, docs):
        self.es.bulk_operation(docs)

    def __call__(self, fp):
        # Filter, prepare, and bulk-index all matching tweets from the given file.
        tweets = self.generate_tweets(fp)
        tweets = self.prepare_docs(tweets)
        self.commit_docs(tweets)
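
A minimal usage sketch (the file path is an assumed example; Fill only needs a JSON-lines tweet dump plus the configured Elastic wrapper and DOCUMENT_INDEX):

fill = Fill()
fill('input/tweets.jsonl')  # hypothetical path to a JSON-lines dump of tweets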
Example #2
from datetime import datetime

# Project-specific dependencies assumed available in this codebase:
# Elastic, Predictor, remove_field_from_index, clean_text, and DOCUMENT_INDEX.


def classify(refresh=False):
    # Label every located, not-yet-classified tweet as event related or not,
    # writing the predictions back to Elasticsearch in batches.
    es = Elastic()

    classify_per = 10_000  # number of tweets classified per bulk batch

    if refresh:
        # Drop any existing labels so every document is reclassified from scratch.
        remove_field_from_index(DOCUMENT_INDEX, 'event_related')

    predictor = Predictor()

    # Select documents that have locations but no 'event_related' label yet.
    query = {
        'query': {
            "bool": {
                "must": [
                    {
                        'exists': {
                            'field': 'locations'
                        }
                    }
                ],
                "must_not": {
                    'exists': {
                        'field': 'event_related'
                    }
                }
            }
        }
    }
    n = es.n_hits(index=DOCUMENT_INDEX, body=query)
    tweets = es.scroll_through(index=DOCUMENT_INDEX, body=query)
    tweet_subset = []
    for i, tweet in enumerate(tweets):
        if not i % classify_per:
            print(f"{i}/{n} ({int(i/n*100)}%) - {datetime.now()}")
        tweet_subset.append(tweet)

        if len(tweet_subset) == classify_per:
            # Build classifier examples for the full batch.
            IDs = []
            examples = []
            for tweet in tweet_subset:
                tweet = tweet['_source']
                IDs.append(tweet['id'])
                example = {
                    "id": tweet['id'],
                    "sentence1": clean_text(tweet['text'], lower=False),
                    "label": 0
                }
                examples.append(example)

            labels = predictor(examples)
            # Write the predicted labels back to Elasticsearch as partial updates.
            es_update = []
            for ID, label in zip(IDs, labels):
                es_update.append({
                    'doc': {
                        'event_related': label == 'yes'
                    },
                    '_index': DOCUMENT_INDEX,
                    '_id': ID,
                    '_op_type': 'update',
                })

            es.bulk_operation(es_update)

            tweet_subset = []

    # Note: a trailing partial batch (fewer than classify_per tweets) is never
    # classified by the loop above; a final flush would be needed to cover it.
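
A minimal invocation sketch, assuming classify is run as a one-off job against the configured DOCUMENT_INDEX:

classify(refresh=True)   # hypothetical call: drop old labels and reclassify everything
classify()               # or only classify documents that have no label yet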