Example #1
def start(cfg):
    access_token = cfg['dropbox']['access_token']
    input_files = cfg['input']
    output_folder = cfg['output']
    fetcher = Fetcher(access_token, output_folder)

    for input_file in input_files:
        print("input file: %s" % input_file)
        fetcher.load_input_urls(input_file)

    mkdirs(output_folder)
    print("output folder: %s" % output_folder)

    fetcher.add_filter_rule(filter_by_mat)
    print("test filter:")
    # use .get() so a missing "filter" section simply skips the self-test
    test_names = cfg.get('filter', {}).get('test_names')
    if test_names:
        for test_name in test_names:
            print("[ %-5r ]: %s" % (fetcher.test_filter_rule(test_name), test_name))

    fetcher.fetch_all()
    return fetcher
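The start() helper above reads a nested configuration mapping. A minimal sketch of the shape it expects, inferred from the keys it accesses; the concrete values are placeholders, not from the original project:

# Hypothetical config; the keys mirror the lookups inside start(),
# the values are illustrative only.
cfg = {
    "dropbox": {"access_token": "YOUR_DROPBOX_TOKEN"},
    "input": ["urls_a.txt", "urls_b.txt"],      # files of input URLs
    "output": "downloads/",                     # folder passed to mkdirs()
    "filter": {"test_names": ["example.mat"]},  # optional filter self-test
}

fetcher = start(cfg)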
Example #2
import boto3
import networkx as nx
from boto3.dynamodb.conditions import Attr

# get_config and Fetcher are project-local helpers (configuration loader
# and Twitter API wrapper); their modules are not shown in this listing.


class GraphBuilder(object):
    def __init__(self):
        conf = get_config()
        aws_access_key_id = conf["amazon"]["access_tokens"]["access_key_id"]
        aws_secret_access_key = conf["amazon"]["access_tokens"][
            "secret_access_key"]

        sts_client = boto3.client('sts',
                                  aws_access_key_id=aws_access_key_id,
                                  aws_secret_access_key=aws_secret_access_key)

        # Call the assume_role method of the STSConnection object and pass the role
        # ARN and a role session name.
        assumed_role_object = sts_client.assume_role(
            RoleArn="arn:aws:iam::835348665944:role/dynamo_db_full_access",
            RoleSessionName="AssumeRoleSession1")

        # From the response that contains the assumed role, get the temporary
        # credentials that can be used to make subsequent API calls
        credentials = assumed_role_object['Credentials']

        # Use the temporary credentials that AssumeRole returns to make a
        # connection to Amazon DynamoDB
        self.dynamodb = boto3.resource(
            'dynamodb',
            aws_access_key_id=credentials['AccessKeyId'],
            aws_secret_access_key=credentials['SecretAccessKey'],
            aws_session_token=credentials['SessionToken'],
            region_name='eu-west-1')

        self.tweet_table = self.dynamodb.Table('TweetSecond')
        self.already_created_tweet_ids = list()

        self.graph = nx.DiGraph()
        self.fetcher = Fetcher()

    def _yield_tweets(self, batches=None):
        # DynamoDB scan results are paginated; keep scanning while the
        # response carries a LastEvaluatedKey
        response = self.tweet_table.scan(
            FilterExpression=Attr('retweet_count').gt(10000))
        data = response['Items']
        first = True
        batch_no = 1
        while 'LastEvaluatedKey' in response:
            response = self.tweet_table.scan(
                ExclusiveStartKey=response['LastEvaluatedKey'],
                FilterExpression=Attr('retweet_count').gt(10000))
            batch_no += 1

            if batches and batch_no >= batches:
                # a plain return ends the generator; raising StopIteration
                # inside a generator is a RuntimeError since PEP 479
                return

            if first:
                # merge the first page into the second so no items are lost
                data.extend(response['Items'])
                first = False
                yield data
            else:
                yield response["Items"]

    def _get_popular_tweets(self):
        for batch in self._yield_tweets(batches=100):
            for tweet in batch:
                if self.check_if_tweet_has_correct_date(tweet):
                    yield tweet

    def check_if_tweet_has_correct_date(self, tweet):
        # created_at looks like "Wed May 01 12:00:00 +0000 2019";
        # only the month (index 1) and year (index 5) matter here
        date_str = tweet["created_at"]
        date_array = date_str.split(" ")
        try:
            month = date_array[1]
            year = date_array[5]

            if year == '2019' and month == 'May':
                return True
            print("Skipping tweet dated: {}".format(date_str))
            return False
        except IndexError:
            print("Malformed created_at value: {}".format(date_str))
            return False

    def save_to_file(self, output_file_path):
        nx.write_gexf(self.graph, output_file_path)

    def _tweet_to_edges(self, tweet, parameter='favourites'):
        original_tweet_owner = tweet["retweeted_status"]["user"]["screen_name"] +\
                               tweet["retweeted_status"]["user"]["id_str"]
        original_tweet_id = tweet["retweeted_status"]["id_str"]
        if original_tweet_id in self.already_created_tweet_ids:
            print("Tweet with id = {}, of user = {} already appended".format(
                original_tweet_id, original_tweet_owner))
            return

        if parameter == 'favourites':
            user_list = self.fetcher.get_users_that_like_tweet(
                original_tweet_id)
        elif parameter == 'retweets':
            user_list = self.fetcher.get_users_that_retweet_tweet(
                original_tweet_id)
        else:
            raise RuntimeError(
                "Incorrect parameter. Parameter should be one of {}".format(
                    str(["favourites", "retweets"])))

        self.already_created_tweet_ids.append(original_tweet_id)
        for user_name in user_list:
            if self.graph.has_edge(original_tweet_owner, user_name):
                # we added this one before, just increase the weight by one
                self.graph[original_tweet_owner][user_name]['weight'] += 1
            else:
                # new edge. add with weight=1
                self.graph.add_edge(original_tweet_owner, user_name, weight=1)

    def build(self, parameter="favourites"):
        twitter_calls = 0
        for tweet in self._get_popular_tweets():
            try:
                self._tweet_to_edges(tweet, parameter)
                twitter_calls += 1
            except Exception as e:
                try:
                    print(str(e))
                    # tweepy packs API errors as [{"code": ..., "message": ...}];
                    # code 88 is Twitter's rate-limit error
                    code = e.args[0][0]["code"]
                    if code == 88:
                        print("Twitter was called {} times".format(
                            twitter_calls))
                        return
                except (IndexError, TypeError):
                    pass
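
A minimal sketch of how GraphBuilder appears intended to be driven, inferred from its public methods; the output file name is illustrative:

# Hypothetical driver for the class above.
builder = GraphBuilder()
builder.build(parameter="retweets")         # or the default "favourites"
builder.save_to_file("retweet_graph.gexf")  # GEXF is readable by Gephi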
Example #3
#!/usr/bin/python3

import json
import sys
import random

from fetcher.fetcher import Fetcher, Feed
from curator.curator import Curator
from summarizer.summarizer import Summarizer

if __name__ == '__main__':
    print("Initializing...")
    fetcher = Fetcher()
    curator = Curator()
    summarizer = Summarizer()

    # Fetch all articles from rss feeds

    print("\nFetching articles...")
    articles = fetcher.simple_fetch(cached=True)
    print("Articles found: %d" % len(articles))

    # Curation testing

    print("\nCurating articles...")
    curated_articles = curator.curate(articles, False)
    print("Curation complete.")
    print("-- Clusters found: --")
    for group in curated_articles:
        group_articles = group["articles"]  # avoid shadowing the fetched list
        for article in group_articles:
            print(article)  # minimal loop body; the original snippet is truncated here
Example #4
from flask import Flask, jsonify, request, make_response
from werkzeug.exceptions import HTTPException

from fetcher.fetcher import Fetcher
from curator.curator import Curator
from summarizer.summarizer import Summarizer

import json
import sys
import os.path

import traceback
import time

# initialise the shared pipeline components
fetcher = Fetcher()
curator = Curator()
summarizer = Summarizer()

app = Flask(__name__)


# flask routes
@app.route('/')
def index():
    return "Hello, Home!"


@app.errorhandler(404)
def not_found(error):
    print('[ERROR] Offending request: ', request)
    # return a JSON body so API clients receive a structured error
    return make_response(jsonify({'error': 'Not found'}), 404)
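
traceback is imported above but never used in the visible portion of the file. A plausible catch-all handler it suggests, reconstructed as a common Flask pattern rather than taken from the original service:

# Hypothetical 500 handler; an assumption, since the original file is
# truncated before traceback is ever used.
@app.errorhandler(Exception)
def internal_error(error):
    traceback.print_exc()  # dump the full stack trace for debugging
    return make_response(jsonify({'error': 'Internal server error'}), 500)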
Example #5
import json
import random
from decimal import Decimal

import boto3
from boto3.dynamodb.conditions import Key

# Fetcher, get_config and remove_nones are project-local helpers; their
# modules are not shown in this listing.


def lambda_handler(event, context):
    since_date = "2019-01-01"
    fetcher = Fetcher()
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('TweetSecond')

    queries = get_config(config_file_path="queries/queries.yml")
    screen_names = queries["users"]
    tags = queries["tags"]

    # shuffle lists
    random.shuffle(tags)
    random.shuffle(screen_names)

    fetched_tweets = []
    unique_ids = []

    for screen_name in screen_names:
        latest_user_tweet_id = None
        try:
            result = table.query(
                IndexName="user_screen_name-id_str-index",
                KeyConditionExpression=Key('user_screen_name').eq(screen_name),
                ScanIndexForward=False,
                Limit=1)
            latest_user_tweet_id = int(result["Items"][0]["id_str"])
        except Exception as e:
            print(e)

        print("Latest user tweet id: " + str(latest_user_tweet_id))

        currently_fetched_tweets = fetcher.get_user_timeline_tweets(
            screen_name=screen_name, since_id=latest_user_tweet_id)
        for tweet in currently_fetched_tweets:
            tweet_dict = tweet._json
            tweet_dict["user_screen_name"] = screen_name
            tweet_dict["hashtag"] = "None"
            tweet_dict = json.loads(json.dumps(tweet_dict),
                                    parse_float=Decimal)
            tweet_dict = remove_nones(tweet_dict)

            if tweet_dict["id_str"] not in unique_ids:
                fetched_tweets.append(tweet_dict)
                unique_ids.append(tweet_dict["id_str"])

    for tag in tags:
        latest_tag_tweet_id = None
        try:
            result = table.query(IndexName="hashtag-id_str-index",
                                 KeyConditionExpression=Key('hashtag').eq(tag),
                                 ScanIndexForward=False,
                                 Limit=1)
            latest_tag_tweet_id = int(result["Items"][0]["id_str"])
        except Exception as e:
            print(e)
        print("Latest tag tweet id: " + str(latest_tag_tweet_id))

        currently_fetched_tweets = fetcher.get_tweets_by_term(
            term=tag, since_id=latest_tag_tweet_id, since=since_date)
        for tweet in currently_fetched_tweets:
            tweet_dict = tweet._json
            tweet_dict["user_screen_name"] = "None"
            tweet_dict["hashtag"] = tag
            tweet_dict = json.loads(json.dumps(tweet_dict),
                                    parse_float=Decimal)
            tweet_dict = remove_nones(tweet_dict)

            if tweet_dict["id_str"] not in unique_ids:
                fetched_tweets.append(tweet_dict)
                unique_ids.append(tweet_dict["id_str"])

    print("Writing to DynamoDb")
    # write to DynamoDB
    with table.batch_writer() as batch:
        for tweet in fetched_tweets:
            batch.put_item(Item=tweet)
    print("Finished writing to DynamoDb")
Example #6
                        help="Query to summarise by")
    parser.add_argument('--summarize',
                        action='store_true',
                        help='Outputs summary of given article')
    parser.add_argument(
        '-c',
        '--cluster',
        action='store_true',
        help='On: group by clusters; Off (default): group by topics')
    parser.add_argument('--curate',
                        action='store_true',
                        help='Outputs list of curated articles')
    args = parser.parse_args()

    try:
        fetcher = Fetcher()
        if args.curate:
            curator = Curator()

            articles = fetcher.simple_fetch()
            result = curator.curate(articles, args.cluster)

            with open('curated_articles.out', 'w') as output:
                # write() takes a single string; writelines() expects an
                # iterable of lines, and json.dumps returns one string
                output.write(json.dumps(result, sort_keys=True, indent=4))

            print(json.dumps(result, sort_keys=True))
            sys.stdout.flush()
        elif args.summarize:
            t00 = time.time()
            t0 = time.time()
            print("[{}] starting ".format(time.time() - t0), file=sys.stderr)
Example #7
        tfidf_matrix = tfidf_vectorizer.fit_transform(corpus)

        # apply k-means clustering for classification;
        # heuristic: roughly sqrt(len(corpus)) / 2 clusters
        n_clusters = round(len(corpus)**0.5 * 0.5)
        clustering_model = KMeans(
            n_clusters=n_clusters,
            max_iter=300)
        # note: precompute_distances and n_jobs were deprecated in
        # scikit-learn 0.23 and removed later, so they are omitted here

        document_labels = clustering_model.fit_predict(
            tfidf_matrix)  # array of cluster labels, one per document

        # evaluate clustering quality: -1 is bad, 0 means overlapping
        # clusters, +1 is good
        print(silhouette_score(tfidf_matrix, labels=document_labels))
        return document_labels


if __name__ == '__main__':
    from fetcher.fetcher import Fetcher
    fetcher = Fetcher()
    curator = Curator()
    articles = fetcher.simple_fetch()
    print(articles)
    print(curator.curate(articles, False))
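
The snippet prints a single silhouette score for one heuristic choice of k. A common refinement is to scan a small range of cluster counts and keep the best-scoring model; a sketch assuming tfidf_matrix is already computed as above:

# Hypothetical model selection by silhouette score.
best_k, best_score, best_labels = None, -1.0, None
for k in range(2, 12):
    labels = KMeans(n_clusters=k, max_iter=300).fit_predict(tfidf_matrix)
    score = silhouette_score(tfidf_matrix, labels=labels)
    if score > best_score:
        best_k, best_score, best_labels = k, score, labels
print("best k = {} (silhouette = {:.3f})".format(best_k, best_score))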