Example #1
import subprocess

from dateutil import parser

from utilities import Utilities  # project-local helper; import path assumed

class BackgroundDataCollection:

    def __init__(self):
        self.data_source_file = None
        self.utilities = Utilities()

    def set_data_source_file(self, source_file):
        self.data_source_file = source_file

    def remove_out_of_range_historic_urls(self, urls, date_from, date_to):
        """
        Remove out of range urls

        :param urls: list of urls
        :param date_from: date from
        :param date_to: date to
        :return: list of in the range urls
        """
        in_range_urls = []

        try:
            date_from = parser.parse(str(date_from))
            date_to = parser.parse(str(date_to))
        except ValueError:
            raise ValueError("Invalid date range. Please provide dates in yyyymmdd format.")

        for url in urls:
            # Snapshot URLs are assumed to follow the wayback layout
            # https://web.archive.org/web/<14-digit timestamp>/<original url>,
            # so characters 28-41 hold the yyyymmddHHMMSS timestamp.
            if len(url) > 43:
                date_str = url[28:42]

                url_time = parser.parse(date_str)

                if date_from <= url_time <= date_to:
                    in_range_urls.append(url)

        return in_range_urls

    def collect_data(self, date_from, date_to):
        """
        Run the whole workflow for historical article collection within a range

        :param date_from: date from
        :param date_to: date to
        :return: list of articles
        """

        # os.makedirs(self.articles_base_dir, exist_ok=True)
        try:
            parser.parse(str(date_from))
            parser.parse(str(date_to))
        except ValueError:
            print("Invalid date format. Please provide date in yyyymmdd format.")
            return

        source_urls = self.utilities.read_lines_from_file(self.data_source_file)
        new_file_count = 0
        for source_url in source_urls:

            url_str = subprocess.run(
                ['waybackpack', source_url, '--list', '--from-date', str(date_from), '--to-date', str(date_to)],
                stdout=subprocess.PIPE).stdout.decode('utf-8')
            urls = url_str.splitlines()

            # NOTE: debug output; exit() stops the process after the first source URL,
            # so the rest of the collection workflow is not reached in this example.
            print(urls)
            exit()
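
A minimal usage sketch for Example #1, assuming wayback-style snapshot URLs. The file name, URLs, and dates below are invented; collect_data additionally requires the waybackpack CLI on the PATH and, as listed, stops after printing the snapshots of the first source URL.

# Hypothetical usage of BackgroundDataCollection (names, URLs, and dates are illustrative).
collector = BackgroundDataCollection()
collector.set_data_source_file('source_urls.txt')  # assumed: one seed URL per line

snapshots = [
    'https://web.archive.org/web/20180115120000/http://example.com/news',
    'https://web.archive.org/web/20190101000000/http://example.com/news',
]
in_range = collector.remove_out_of_range_historic_urls(snapshots, 20180101, 20180131)
print(in_range)  # keeps only the 2018-01-15 snapshot
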
Example #2
import re
from datetime import datetime, timedelta

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk.tag import StanfordNERTagger
from nltk.parse.stanford import StanfordDependencyParser

# Project-local helpers; import paths are assumed.
from config import app_config
from preprocessor import Preprocessor
from utilities import Utilities

class EventExtractor:
    def __init__(self):
        self.data_file = app_config['data_file']
        self.texts_in_file = 'texts_in_file.txt'
        self.ner_texts_file = 'output.txt'
        self.utilities = Utilities()
        self.lemmatizer = WordNetLemmatizer()
        self.preprocessor = Preprocessor(
            ['remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize'])

        # jar_files = os.path.join(os.path.dirname(__file__), 'jars')
        # self.sutime = SUTime(jars=jar_files, mark_time_ranges=True)

    def save_texts_in_file(self):
        items = self.utilities.read_from_csv(self.data_file)

        header = items[0]

        texts = [item[header.index('text')] for item in items[1:]]

        processed_texts = [
            self.preprocessor.preprocess(text).encode('utf8') for text in texts
        ]

        self.utilities.save_list_as_text_file(processed_texts,
                                              self.texts_in_file)

    def prepare_phrases(self,
                        matches,
                        tag,
                        token_position=0,
                        tag_position=-1,
                        splitter='/'):
        phrases = []
        phrase = ''
        for match in matches:
            match_components = match.split(splitter)
            text_token = match_components[token_position].lower().strip()
            event_tag = match_components[tag_position]

            if event_tag == 'B-' + tag and len(phrase) < 1:
                # first tagged token starts the first phrase
                phrase += text_token
            elif event_tag == 'B-' + tag and len(phrase) > 0:
                # a new B- tag starts a new phrase; store the previous one
                phrases.append(phrase)
                phrase = text_token
            else:
                # an I- tag continues the current phrase
                phrase += ' ' + text_token
        if len(phrase) > 0:
            phrases.append(phrase)
        phrases = list(set(phrases))

        return phrases
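
    # Illustration (assumed input): prepare_phrases merges B-/I- tagged tokens into phrases.
    # The strings below are invented and only mimic the token/entity/POS/event-tag layout
    # that the regexes in this class expect:
    #   prepare_phrases(['storm/O/NN/B-EVENT', 'surge/O/NN/I-EVENT', 'flooding/O/NN/B-EVENT'], 'EVENT')
    #   -> ['storm surge', 'flooding']  (order may vary because of the set() dedup)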

    def get_event_phrases(self, text):
        tag_name = 'EVENT'
        # Each token is assumed to be in token/entity-tag/POS/event-tag form;
        # this matches tokens whose event tag is B-EVENT or I-EVENT.
        matches = re.findall(r'\w+/O/[A-Z]+/[BI]-' + tag_name, text)
        phrases = self.prepare_phrases(matches, tag_name)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def get_event_locations(self, text):
        tag_name = 'geo-loc'
        matches = re.findall(r'\w+/[BI]-' + tag_name + '/[A-Z]+/O', text)
        phrases = self.prepare_phrases(matches=matches,
                                       tag=tag_name,
                                       token_position=0,
                                       tag_position=1)
        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def get_event_entities(self, text):
        tag_names = [
            'person', 'company', 'facility', 'product', 'band', 'sportsteam',
            'movie', 'tv-show'
        ]
        phrases = []
        for tag_name in tag_names:
            matches = re.findall(r'\w+/[BI]-' + tag_name + '/[A-Z]+/O', text)
            if len(matches) > 0:
                phrases += self.prepare_phrases(matches=matches,
                                                tag=tag_name,
                                                token_position=0,
                                                tag_position=1)

        joined_text = ', '.join(phrases) if len(phrases) > 0 else ''

        return joined_text

    def extract_events(self):
        data_rows = self.utilities.read_from_csv(self.data_file)
        text_rows = self.utilities.read_lines_from_file(self.ner_texts_file)

        header = data_rows[0]
        del data_rows[0]
        events = []
        unique_texts = []
        for data_row, text_row in zip(data_rows, text_rows):
            text = self.preprocessor.preprocess(data_row[header.index('text')])

            if text in unique_texts:
                continue

            event = {
                'tweet_id': data_row[header.index('id')],
                'entities': self.get_event_entities(text_row),
                'locations': self.get_event_locations(text_row),
                'event_time': data_row[header.index('created_at')],
                'event_phrases': self.get_event_phrases(text_row),
            }

            events.append(event)
            unique_texts.append(text)

        return events

    def extract_events_from_stanford_dependencies(self, dependencies,
                                                  ner_tags):
        entity_categories = ['PERSON', 'LOCATION', 'ORGANIZATION']
        raw_events = {}
        for dependency in dependencies:
            if len(dependency) == 3:
                head = dependency[0]
                relation = dependency[1]
                tail = dependency[2]

                if head[1].startswith('VB'):
                    event_keywords = list(raw_events.keys())
                    event_keyword = self.lemmatizer.lemmatize(
                        head[0].lower(), 'v')
                    if event_keyword not in event_keywords:
                        raw_events[event_keyword] = {}

                    if relation.endswith('subj'):
                        subject_pronoun = [
                            'i', 'you', 'he', 'she', 'we', 'they', 'who'
                        ]
                        subj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in subject_pronoun:
                            subj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[1] in entity_categories:
                                    subj_value = ner_tag[1]
                        raw_events[event_keyword]['subj'] = subj_value

                    if relation == 'dobj':
                        objective_pronoun = [
                            'me', 'you', 'him', 'her', 'us', 'them'
                        ]
                        dobj_value = self.lemmatizer.lemmatize(tail[0].lower())

                        if tail[0].lower() in objective_pronoun:
                            dobj_value = 'PERSON'
                        else:
                            for ner_tag in ner_tags:
                                if ner_tag[0] == tail[0] and ner_tag[1] in entity_categories:
                                    dobj_value = ner_tag[1]

                        raw_events[event_keyword]['dobj'] = dobj_value

                    if relation == 'compound:prt':
                        raw_events[event_keyword]['prt'] = tail[0]

        events = []
        for verb in list(raw_events.keys()):
            event = raw_events[verb]
            if len(verb) < 2 or 'subj' not in list(event.keys()) or len(event['subj']) < 2 \
                    or 'dobj' not in list(event.keys()) or len(event['dobj']) < 2:
                continue

            event['keyword'] = verb
            events.append(event)

        return events
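
    # Illustration (hand-written, assumed shapes): NLTK's StanfordDependencyParser yields
    # triples of the form ((word, POS), relation, (word, POS)), and the NER tagger yields
    # (token, label) pairs. For the invented sentence "Maria hit the coast":
    #   dependencies = [(('hit', 'VBD'), 'nsubj', ('Maria', 'NNP')),
    #                   (('hit', 'VBD'), 'dobj', ('coast', 'NN'))]
    #   ner_tags = [('Maria', 'PERSON'), ('hit', 'O'), ('the', 'O'), ('coast', 'O')]
    #   extract_events_from_stanford_dependencies(dependencies, ner_tags)
    #   -> [{'subj': 'PERSON', 'dobj': 'coast', 'keyword': 'hit'}]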

    def get_unique_tweets(self, n_rows=None):
        data_rows = self.utilities.read_from_csv(self.data_file)
        preprocessor = Preprocessor([
            'remove_urls', 'remove_mentions', 'remove_hashtags', 'normalize',
            'remove_non_letters'
        ])

        header = data_rows[0]
        del data_rows[0]
        tweet_rows = {}
        for data_row in data_rows:
            if n_rows is not None and len(tweet_rows) >= n_rows:
                break
            tweet = preprocessor.preprocess(data_row[header.index('text')])
            if tweet not in list(tweet_rows.keys()):
                tweet_rows[tweet] = data_row
        tweet_rows = [header] + list(tweet_rows.values())

        return tweet_rows

    def get_tweet_sentences(self, tweet_rows):
        header = tweet_rows[0]
        del tweet_rows[0]

        tweet_sentences = []
        for tweet_row in tweet_rows:
            created_at = tweet_row[header.index('created_at')]
            text = self.preprocessor.preprocess(
                tweet_row[header.index('text')])
            sentences = sent_tokenize(text)
            for sentence in sentences:
                if len(sentence) > 1:
                    tweet_sentences.append((created_at, sentence))

        return tweet_sentences

    def extract_events2(self, tweet_sentences):
        path_to_jar = 'lib/stanford_parser/stanford-parser.jar'
        path_to_models_jar = 'lib/stanford_parser/stanford-english-corenlp-2018-02-27-models.jar'
        path_to_ner_tagger = 'lib/stanford_ner/stanford-ner.jar'
        path_to_ner_model = 'lib/stanford_ner/english.all.3class.distsim.crf.ser.gz'

        sentence_preprocessor = Preprocessor(['remove_non_letters'])
        ner_tagger = StanfordNERTagger(path_to_ner_model, path_to_ner_tagger)
        dependency_parser = StanfordDependencyParser(
            path_to_jar=path_to_jar, path_to_models_jar=path_to_models_jar)

        events = []

        chunks = list(
            self.utilities.chunkify_list(data_list=tweet_sentences,
                                         items_per_chunk=1000))

        for chunk in chunks:
            created_ats = []
            sentences = []
            for chunk_item in chunk:
                created_ats.append(chunk_item[0])
                sentences.append(
                    sentence_preprocessor.preprocess(chunk_item[1]))

            chunk_sent_dependencies = dependency_parser.raw_parse_sents(
                sentences)
            chunk_sent_ner_tags = ner_tagger.tag_sents(
                [sentence.split() for sentence in sentences])

            for sent_dependencies, sent_ner_tags, created_at in zip(
                    chunk_sent_dependencies, chunk_sent_ner_tags, created_ats):
                dependencies = [
                    list(parse.triples()) for parse in sent_dependencies
                ]

                if len(dependencies) > 0 and dependencies[0] is not None:
                    sentence_events = self.extract_events_from_stanford_dependencies(
                        dependencies[0], sent_ner_tags)
                    if len(sentence_events) > 0:
                        for sentence_event in sentence_events:
                            events.append((created_at, sentence_event))

        return events

    def chunkify_events_by_timeslots(self, events, duration):
        slot_starts_at = None
        event_chunks = []
        event_chunk = []
        for event in events:
            created_at = datetime.strptime(event[0], '%d-%m-%Y %H:%M')

            if slot_starts_at is None:
                slot_starts_at = created_at

            # Start a new chunk once the current slot's duration (in seconds) has elapsed.
            if len(event_chunk) > 0 and created_at > slot_starts_at + timedelta(seconds=duration):
                event_chunks.append(event_chunk)
                event_chunk = []
                slot_starts_at = created_at
            event_chunk.append(event)
        if len(event_chunk) > 0:
            event_chunks.append(event_chunk)
        return event_chunks
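
A high-level driver sketch for Example #2, assuming app_config['data_file'] points at a tweet CSV with id, text, and created_at columns (created_at in the dd-mm-YYYY HH:MM format that chunkify_events_by_timeslots parses), the Stanford jars listed in extract_events2 are present, and the required NLTK data (punkt, wordnet) is installed. The row limit and slot duration below are example values.

# Hypothetical end-to-end run (paths, column names, and parameter values are assumptions).
extractor = EventExtractor()

tweet_rows = extractor.get_unique_tweets(n_rows=5000)        # de-duplicated CSV rows, header first
tweet_sentences = extractor.get_tweet_sentences(tweet_rows)  # (created_at, sentence) pairs
events = extractor.extract_events2(tweet_sentences)          # (created_at, event dict) pairs

# Group the extracted events into 24-hour windows (duration is given in seconds).
event_chunks = extractor.chunkify_events_by_timeslots(events, duration=24 * 60 * 60)
print(len(event_chunks), 'time slots')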