Example No. 1
    def __init__(self):
        self.__aData = DataAggregator()
        self.__carousel = None
        self.__header = None
        self.__cart = None
        self.__hangarCameraLocation = None
        return
Example No. 2
    def _execute_checkers(self):
        '''
        Executes the checkers for mismatched data.
        '''
        template_handler = TrueTemplateHandler(self.temp_name)

        mv = MissingValChecker(self.temp_name)
        da = DataAggregator(self.temp_name)
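        # Run the missing-value check first, then pass the checked frame to the aggregator.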
        self.df = mv.execute(self.df, template_handler)
        new_df = da.execute(self.df, template_handler)
Example No. 3
def get_aggregate_data(q_out):
    all_data = retrieve_data(q_out)
    aggregator = DataAggregator()
    return aggregator.parse_data(all_data)
Example No. 4
    def setUp(self):
        self.mock_template = MockBTemplate("mock_b_template")
        self.da = DataAggregator("mock_b_template")
Example No. 5
import re
import sys
from datetime import date

import numpy as np
import pandas as pd
import wikipedia

# DataAggregator is assumed to be importable from the surrounding project.


class SummarizeNER(object):
    def __init__(self, df):
        self.data = df
        self.cleaned_data = self.get_cleaned_data()
        self.cleaned_phrases = self.get_ner_tags()

    def get_cleaned_data(self):
        return [self.clean(text) for text in self.data['text']]

    def get_summarized_data(self):
        wikidf = pd.DataFrame(columns=("NER", "Summary"))
        wikidf["NER"] = self.cleaned_phrases
        wikidf["Summary"] = self.get_wiki_summary()
        return wikidf

    def del_repeat(self, seq):
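        # Remove duplicates from seq while preserving order; seen_add is bound
        # locally as a small speed optimisation.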
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    def get_wiki_summary(self, sentences=4):
        wiki_summary = []

        for i, phrase in enumerate(self.cleaned_phrases):
            print("Downloading ({}/{}) wikipedia page...".format(
                i + 1, len(self.cleaned_phrases)),
                  end="\r")
            try:
                summary = wikipedia.summary(phrase, sentences=sentences)
            except Exception as e:
                # Retry with the first suggestion taken from the error message
                # (e.g. a disambiguation option); otherwise fall back to a
                # placeholder summary.
                try:
                    a = str(e).splitlines()[1]
                    summary = wikipedia.summary(a, sentences=sentences)
                except Exception:
                    summary = "No wikipedia page found"

            wiki_summary.append(summary)

        return wiki_summary

    def clean(self, text, url=True, words_only=True, first_n_sent=(False, 4)):
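        # Optionally strip URLs, keep only alphabetic characters, and/or
        # truncate the text to its first few sentence-like chunks.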
        if url:
            text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',
                          '', text)

        if words_only:
            regex = re.compile('[^a-zA-Z]')
            text = regex.sub(' ', text)

        if first_n_sent[0]:
            text = re.match(r'(?:[^.:;]+[.:;]){4}', text).group()

        return text

    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')

        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)

        n = []
        for nt in NERTags:
            n.extend(nt)

        # Get the indexes of all words that carry an NER tag (label != "O").
        ids = [i for i, a in enumerate(n) if a[1] != "O"]
        a = np.array(ids)
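        # np.split below groups runs of consecutive indices, so each run of
        # adjacent tagged tokens becomes one multi-word entity phrase.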

        consecutive_ids = np.split(a, np.where(np.diff(a) != 1)[0] + 1)

        phrases = []
        for ci in consecutive_ids:
            # Join each run of adjacent tagged tokens into one entity phrase.
            phrase = ""
            for id_ in ci:
                phrase += "{} ".format(n[id_][0])

            phrases.append(phrase)

        cleaned_phrases = self.del_repeat(phrases)
        return cleaned_phrases


if __name__ == '__main__':
    data_helper = DataAggregator()
    date_range = [date.today().strftime('%Y-%m-%d')]  # Only today.
    df = data_helper.get_data(date_range=date_range)

    sn = SummarizeNER(df)
    sd = sn.get_summarized_data()
    print(sd.to_string().encode("UTF-8"))
Example No. 6
    def top_mentions(self, X):
        # Collect every @-mentioned screen name across all tweets and return
        # the X most common ones.
        h = []
        for tweet in self.tweets:
            usermentions = [
                mention.get('screen_name')
                for mention in tweet.entities.get("user_mentions")
            ]
            if len(usermentions) > 0:
                h.extend(usermentions)

        c = Counter(h)
        return c.most_common(X)


if __name__ == '__main__':
    data_helper = DataAggregator()
    date_range = [date.today().strftime('%Y-%m-%d')]  # Only today.
    df = data_helper.get_data(date_range=date_range)

    tweet_stats = TwitterStatistics(df)
    tdf = tweet_stats.get_data()

    try:
        print(tdf.to_string())
    except Exception:
        # Fall back to a raw UTF-8 byte write when the console cannot render
        # the characters directly.
        sys.stdout.buffer.write(tdf.to_string().encode('utf-8'))
Example No. 7
import re
import sys
from datetime import date

import wikipedia

# DataAggregator is assumed to be importable from the surrounding project.


class SummarizeNER(object):
    def __init__(self, df):
        self.data = df
        self.cleaned_data = self.get_cleaned_data()
        self.cleaned_phrases = self.get_ner_tags()

    def get_cleaned_data(self):
        return [self.clean(text) for text in self.data['text']]

    def get_summarized_data(self):
        self.data['NER'] = self.cleaned_phrases
        self.data['Wiki-NER-Summary'] = self.get_wiki_summary()
        return self.data

    def del_repeat(self, seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]

    def get_wiki_summary(self, sentences=4):
        wiki_summary = []

        for i, phrase in enumerate(self.cleaned_phrases):
            if phrase != 'N/A':
                print("Downloading ({}/{}) wikipedia page...".format(
                    i + 1, len(self.cleaned_phrases)),
                      end="\r")
                try:
                    summary = wikipedia.summary(phrase[0], sentences=sentences)
                except Exception as e:
                    # Retry with the first suggestion taken from the error
                    # message (e.g. a disambiguation option); otherwise fall
                    # back to a placeholder summary.
                    try:
                        a = str(e).splitlines()[1]
                        summary = wikipedia.summary(a, sentences=sentences)
                    except Exception:
                        summary = "No wikipedia page found"
            else:
                summary = "No wikipedia page found"

            wiki_summary.append(summary)

        return wiki_summary

    def clean(self, text, url=True, words_only=True, first_n_sent=(False, 4)):
        if url:
            text = re.sub(r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*',
                          '', text)

        if words_only:
            regex = re.compile('[^a-zA-Z]')
            text = regex.sub(' ', text)

        if first_n_sent[0]:
            text = re.match(r'(?:[^.:;]+[.:;]){4}', text).group()

        return text

    def get_ner_tags(self):
        sys.path.append('../preprocess')
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger(
            '../stanford-ner/classifiers/english.all.3class.distsim.crf.ser.gz',
            '../stanford-ner/stanford-ner.jar')

        tokenized_list = [ct.split() for ct in self.cleaned_data]
        NERTags = st.tag_sents(tokenized_list)

        tags = list(NERTags)
        ids = [[i for i, a in enumerate(t) if a[1] != "O"]
               for t in tags]

        phrases = []
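        # For each sentence, join its NER-tagged tokens into one phrase paired
        # with the tag of the last token; sentences with no entities stay "N/A".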
        for i, t in zip(ids, tags):
            phrase = ""
            tt = "N/A"
            for p, index in zip(i, range(len(i))):
                if index == len(i) - 1:
                    phrase += "{}".format(t[p][0])
                    tt = phrase, t[p][1]
                else:
                    phrase += "{} ".format(t[p][0])

            phrases.append(tt)
        return phrases


if __name__ == '__main__':
    data_helper = DataAggregator()
    date_range = [date.today().strftime('%Y-%m-%d')]  # Only today.
    df = data_helper.get_data(date_range=date_range)

    sn = SummarizeNER(df)
    sd = sn.get_summarized_data()
    print(sd.to_string().encode("UTF-8"))
Example No. 8
import json
import re
from datetime import date
from time import sleep

import pandas as pd

# DataAggregator and GoogleDataHelper are assumed to come from the surrounding project.


class GoogleDataEnhancer(object):
    def __init__(self, df):
        self.data = self.get_data(df)
        self.domains = self.get_domains()
        self.results = self.google_search()

    def get_data(self, df):
        a = df[df['source'] == "twitter"].index.tolist()
        tweet = [df["raw_data"][index].text for index in a]
        tweets = [self.clean(t) for t in tweet]

        a = df[df['source'] == "reddit"].index.tolist()
        subs = [df["raw_data"][index].title for index in a]

        data = tweets + subs
        return data

    def get_domains(self):
        with open("../domains.json", "r") as f:
            domains = json.load(f)

        return domains

    def clean(self, text):
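        # Strip URLs, then replace every non-alphabetic character with a space.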
        URLless_text = re.sub(
            r'\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*', '', text)
        regex = re.compile('[^a-zA-Z]')
        cleaned_text = regex.sub(' ', URLless_text)
        return cleaned_text

    def in_domain(self, url):
        # Return the domain category whose known URLs appear in `url`,
        # or "" when the URL matches none of them.
        for d in self.domains:
            for urls in self.domains[d]:
                if urls in url:
                    return d
        return ""

    def google_search(self):
        results = []
        gd = GoogleDataHelper()
        print("* Google Searching Data...")
        for i, d in enumerate(self.data):
            try:
                d = self.clean(d)
                print("* * Downloading ({}/{}) query".format(
                    i + 1, len(self.data)))
                r = gd.get_data(querystring=d)
            except Exception as e:
                print("* * cannot download query ({}) because: ({})".format(
                    i, str(e)))
                # Keep an empty frame so results stays aligned with self.data
                # (enhance() assigns them as a column of equal length).
                r = pd.DataFrame()

            results.append(r)
            sleep(5)  # minimum time to not look like a bot/script

        print("* Download complete! ")
        return results

    def enhance(self):
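        # Build one column per domain category plus raw data/results columns,
        # then bucket each (url, text) search hit into the category its URL matches.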

        df = pd.DataFrame(columns=(list(self.domains.keys())))
        df["data"] = self.data
        df['results'] = self.results

        for i, r in enumerate(self.results):

            # Start every domain-category cell for this row as an empty list.
            for d in self.domains:
                df[d][i] = []

            for url, text in zip(r['author'], r['text']):
                _type = self.in_domain(url)

                if _type != "":
                    t = (url, text)
                    df[_type][i].append(t)

        return df

    #def wiki_summarize(self):
    #import wikipedia
    #self.data


if __name__ == '__main__':
    data_helper = DataAggregator()
    date_range = [date.today().strftime('%Y-%m-%d')]  # Only today.
    df = data_helper.get_data(date_range=date_range)

    gde = GoogleDataEnhancer(df)
    print(gde.enhance())