示例#1
0
def hashTags(id):
    """Export the 10 most common hashtags for a Twitter target.

    Scans every gzipped capture file for the target identified by *id*,
    tallies hashtag occurrences (lower-cased), writes the top ten as
    ``(tag, count)`` tuples to a uniquely named text file under
    ``EXPORTS_BASEDIR`` and records an EXPORTS row for the target.

    :param id: primary key (row_id) of the models.TWITTER record.
    """
    tags = []
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
    with open(
            os.path.join(EXPORTS_BASEDIR,
                         'hashtags_{}.txt'.format(export_uuid)), 'w+') as f:
        for filename in os.listdir(
                os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0],
                             q.title)):
            try:
                if filename.endswith(".gz"):
                    for line in gzip.open(
                            os.path.join(ARCHIVE_BASEDIR, q.targetType,
                                         q.title[0], q.title, filename)):
                        tweet = json.loads(line.decode('utf-8'))
                        for tag in tweet['entities']['hashtags']:
                            tags.append(tag['text'].lower())
            # Skip unreadable/corrupt archives (OSError covers BadGzipFile,
            # EOFError covers truncated gzip) and tweets without the expected
            # JSON structure, instead of aborting the whole export. Was a
            # bare ``except:``, which also swallowed KeyboardInterrupt.
            except (OSError, EOFError, ValueError, KeyError, TypeError):
                continue
        counts = Counter(tags)
        for t in counts.most_common(10):
            f.write(str(t))
            f.write('\n')

    addExportRef = models.EXPORTS(url='hashtags_{}.txt'.format(export_uuid),
                                  type='Hashtags',
                                  exported=datetime.utcnow(),
                                  count=None)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
示例#2
0
def dehydrateUserSearch(id):
    """Export the tweet ids ("dehydrated" tweets) for a Twitter target.

    Walks every gzipped capture file for the target identified by *id*,
    writes one ``id_str`` per line to a uniquely named text file under
    ``EXPORTS_BASEDIR`` and records an EXPORTS row holding the number of
    ids written.

    :param id: primary key (row_id) of the models.TWITTER record.
    """
    count = 0
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
    with open(
            os.path.join(EXPORTS_BASEDIR,
                         'dehydrate_{}.txt'.format(export_uuid)), 'w+') as f:
        for filename in os.listdir(
                os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0],
                             q.title)):
            if filename.endswith(".gz"):
                for line in gzip.open(
                        os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0],
                                     q.title, filename)):
                    try:
                        tweet = json.loads(line.decode('utf-8'))['id_str']
                        f.write(tweet)
                        f.write('\n')
                        # Increment only after a successful write: the
                        # original bumped count before parsing, so lines that
                        # failed to parse still inflated the stored count.
                        count = count + 1
                    except (ValueError, KeyError, TypeError):
                        # Skip lines that are not valid tweet JSON instead of
                        # aborting the export (was a bare ``except:``).
                        continue
    # NOTE(review): sibling exports stamp datetime.utcnow(); this one uses
    # local time — left unchanged, but worth unifying. TODO confirm intent.
    addExportRef = models.EXPORTS(url='dehydrate_{}.txt'.format(export_uuid),
                                  type='Dehydrate',
                                  exported=datetime.now(),
                                  count=count)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
示例#3
0
def hashTagsCollection(id):
    """Export the 10 most common hashtags across a collection's targets.

    Iterates every target linked to the collection identified by *id*,
    tallies hashtags (lower-cased) from tweets whose creation date falls
    strictly inside the collection's inclusive date window, writes the top
    ten as ``(tag, count)`` tuples to a uniquely named file under
    ``EXPORTS_BASEDIR`` and records an EXPORTS row for the collection.

    :param id: primary key (row_id) of the models.COLLECTION record.
    """
    tags = []
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.COLLECTION.query.filter(models.COLLECTION.row_id == id).first()
    # Reuse the record already fetched rather than issuing the identical
    # query a second time; `tags` is the collection's linked-targets
    # relationship (distinct from the local hashtag list above).
    linkedTargets = q.tags
    dbDateStart = q.inclDateStart
    dbDateStop = q.inclDateEnd

    with open(
            os.path.join(EXPORTS_BASEDIR,
                         'hashtags_{}.txt'.format(export_uuid)), 'w+') as f:
        for target in linkedTargets:
            print(target.title)
            try:
                for filename in os.listdir(
                        os.path.join(ARCHIVE_BASEDIR, target.targetType,
                                     target.title[0], target.title)):

                    if filename.endswith(".gz"):
                        for line in gzip.open(
                                os.path.join(ARCHIVE_BASEDIR,
                                             target.targetType,
                                             target.title[0], target.title,
                                             filename)):
                            tweet = json.loads(line.decode('utf-8'))
                            tweetDate = datetime.strptime(
                                tweet['created_at'],
                                '%a %b %d %H:%M:%S +0000 %Y')
                            for tag in tweet['entities']['hashtags']:
                                if dbDateStart < tweetDate < dbDateStop:
                                    tags.append(tag['text'].lower())
            except (OSError, EOFError, ValueError, KeyError, TypeError):
                # Skip targets with missing/corrupt archives, malformed
                # tweets, or unset date bounds (TypeError on None compare)
                # rather than aborting (was a bare ``except:``).
                continue

        counts = Counter(tags)
        for t in counts.most_common(10):
            f.write(str(t))
            f.write('\n')

    # BUG FIX: the url was built with '{}'.format(q.title, export_uuid),
    # which recorded 'hashtags_<title>.txt' while the file on disk is
    # 'hashtags_<uuid>.txt'. Use the uuid so the DB row matches the file.
    addExportRef = models.EXPORTS(url='hashtags_{}.txt'.format(export_uuid),
                                  type='Hashtags',
                                  exported=datetime.utcnow(),
                                  count=None)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
示例#4
0
def dehydrateCollection(id):
    """Export tweet ids for every target linked to a collection.

    Writes one ``id_str`` per line — for tweets created strictly inside the
    collection's inclusive date window — to a uniquely named file under
    ``EXPORTS_BASEDIR`` and records an EXPORTS row holding the count.

    :param id: primary key (row_id) of the models.COLLECTION record.
    """
    count = 0
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.COLLECTION.query.filter(models.COLLECTION.row_id == id).first()
    # Reuse the record already fetched rather than issuing the identical
    # query a second time; `tags` is the collection's linked-targets
    # relationship.
    linkedTargets = q.tags
    dbDateStart = q.inclDateStart
    dbDateStop = q.inclDateEnd
    with open(
            os.path.join(EXPORTS_BASEDIR,
                         'dehydrate_{}.txt'.format(export_uuid)), 'w+') as f:
        for target in linkedTargets:
            print(target.title)
            for filename in os.listdir(
                    os.path.join(ARCHIVE_BASEDIR, target.targetType,
                                 target.title[0], target.title)):
                if filename.endswith(".gz"):
                    for line in gzip.open(
                            os.path.join(ARCHIVE_BASEDIR, target.targetType,
                                         target.title[0], target.title,
                                         filename)):
                        try:
                            tweet = json.loads(line.decode('utf-8'))
                            tweetDate = datetime.strptime(
                                tweet['created_at'],
                                '%a %b %d %H:%M:%S +0000 %Y')
                        except (ValueError, KeyError, TypeError):
                            # Skip lines that are not valid tweet JSON or
                            # lack a parseable creation date (was a bare
                            # ``except:``).
                            continue

                        if dbDateStart < tweetDate < dbDateStop:
                            count = count + 1
                            f.write(tweet['id_str'])
                            f.write('\n')

    addExportRef = models.EXPORTS(url='dehydrate_{}.txt'.format(export_uuid),
                                  type='Dehydrate',
                                  exported=datetime.now(),
                                  count=count)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
示例#5
0
def bagger(id):
    """Package a Twitter target's archive as a zipped BagIt bag.

    Copies the target's archive directory into ``EXPORTS_BASEDIR``, turns
    the copy into a BagIt bag carrying the target's descriptive metadata,
    zips it, removes the working copy and records an EXPORTS row for the
    zip file.

    :param id: primary key (row_id) of the models.TWITTER record.
    """
    q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
    dest = os.path.join(EXPORTS_BASEDIR, q.title)
    shutil.copytree(
        os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0], q.title), dest)
    # make_bag rewrites `dest` in place; the returned Bag object was never
    # used, so the `bag = ...` binding is dropped.
    bagit.make_bag(
        dest, {
            'target-type': q.targetType,
            'title': q.title,
            'search-string': q.searchString,
            'search-language': q.searchLang,
            'description': q.description,
            'keywords': q.subject
        })
    # NOTE(review): assumes make_archive(src_dir, dest_zip) is a project
    # helper; shutil.make_archive has a different signature
    # (base_name, format, ...) — TODO confirm which one is imported.
    make_archive(dest, os.path.join(EXPORTS_BASEDIR, '{}.zip'.format(q.title)))
    shutil.rmtree(dest)
    addExportRef = models.EXPORTS(url='{}.zip'.format(q.title),
                                  type='Bag',
                                  exported=datetime.now(),
                                  count=None)
    q.exports.append(addExportRef)
    db.session.commit()
    db.session.close()
示例#6
0
def Followers(id):
    """Export the follower ids of a Twitter user target.

    Fetches follower ids for the target's screen name (stored in
    ``q.title``) via the credentials saved in the database, writes one id
    per line to a uniquely named file under ``EXPORTS_BASEDIR`` and records
    an EXPORTS row holding the count.

    :param id: primary key (row_id) of the models.TWITTER record.
    """
    CREDENTIALS = models.CREDENTIALS.query.one()
    with app.test_request_context():
        t = twarc.Twarc(consumer_key=CREDENTIALS.consumer_key,
                        consumer_secret=CREDENTIALS.consumer_secret,
                        access_token=CREDENTIALS.access_token,
                        access_token_secret=CREDENTIALS.access_secret)
        count = 0
        export_uuid = uuid.uuid4()
        if not os.path.isdir(EXPORTS_BASEDIR):
            os.makedirs(EXPORTS_BASEDIR)
        q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()
        follower_ids = t.follower_ids(q.title)
        with open(
                os.path.join(EXPORTS_BASEDIR,
                             'followers_{}.txt'.format(export_uuid)),
                'w+') as f:
            for follower_id in follower_ids:
                count = count + 1
                # str() guards against the API yielding ints: f.write would
                # raise TypeError on a non-str value. For the expected str
                # ids this is a no-op.
                f.write(str(follower_id))
                f.write('\n')
        addExportRef = models.EXPORTS(
            url='followers_{}.txt'.format(export_uuid),
            type='Followers',
            exported=datetime.now(),
            count=count)
        q.exports.append(addExportRef)
        db.session.commit()
        db.session.close()
示例#7
0
def networkUserSearch(id, users, retweets, min_subgraph_size,
                      max_subgraph_size, output):
    """Export the reply/quote/retweet network of a Twitter target.

    Reads every gzipped capture file for the target identified by *id*,
    builds an undirected networkx graph of tweet relations, optionally
    drops connected components outside a size window, then writes the
    graph to EXPORTS_BASEDIR in the requested format and records an
    EXPORTS row on the target.

    :param id: primary key (row_id) of the models.TWITTER record.
    :param users: truthy -> nodes are screen names (user-to-user graph);
        falsy -> nodes are tweet id strings (tweet-to-tweet graph).
    :param retweets: truthy -> also add retweet edges (tweet-graph mode).
    :param min_subgraph_size: drop components smaller than this (falsy
        disables the lower bound).
    :param max_subgraph_size: drop components larger than this (falsy
        disables the upper bound).
    :param output: one of 'gexf', 'json', 'html'; any other value exports
        nothing (the graph is silently discarded).
    """
    G = networkx.Graph()

    def add(from_user, from_id, to_user, to_id, type):
        "adds one relation to G: user->user when `users` is set, else tweet-id->tweet-id"

        # User-graph mode: both endpoints are screen names.
        if users and to_user:
            G.add_node(from_user, screen_name=from_user)
            G.add_node(to_user, screen_name=to_user)
            G.add_edge(from_user, to_user, type=type)

        # Tweet-graph mode: endpoints are tweet ids; screen_name is kept
        # as a node attribute when known.
        elif not users and to_id:
            G.add_node(from_id, screen_name=from_user, type=type)
            if to_user:
                G.add_node(to_id, screen_name=to_user)
            else:
                G.add_node(to_id)
            G.add_edge(from_id, to_id, type=type)

    def to_json(g):
        # Serialize graph *g* into a d3-force-style {"nodes", "links"} dict.
        j = {"nodes": [], "links": []}
        for node_id, node_attrs in g.nodes(True):
            j["nodes"].append({
                "id": node_id,
                "type": node_attrs.get("type"),
                "screen_name": node_attrs.get("screen_name")
            })
        for source, target, attrs in g.edges(data=True):
            j["links"].append({
                "source": source,
                "target": target,
                "type": attrs.get("type")
            })
        return j

    # `count` tallies every tweet line parsed, not edges added.
    count = 0
    export_uuid = uuid.uuid4()
    if not os.path.isdir(EXPORTS_BASEDIR):
        os.makedirs(EXPORTS_BASEDIR)
    q = models.TWITTER.query.filter(models.TWITTER.row_id == id).first()

    # Pass 1: walk every gzipped capture file for this target and add
    # reply/quote/retweet relations to the graph.
    for filename in os.listdir(
            os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0], q.title)):
        if filename.endswith(".gz"):
            for line in gzip.open(
                    os.path.join(ARCHIVE_BASEDIR, q.targetType, q.title[0],
                                 q.title, filename)):

                try:
                    t = json.loads(line.decode('utf-8'))
                except:
                    # Skip lines that are not valid JSON.
                    continue
                from_id = t['id_str']
                from_user = t['user']['screen_name']
                from_user_id = t['user']['id_str']
                to_user = None
                to_id = None
                count = count + 1
                if users:
                    # User-graph mode: one edge per mentioned user.
                    # NOTE(review): mention edges are tagged 'reply' —
                    # possibly intended to be 'mention'; confirm before
                    # relying on edge types.
                    for u in t['entities'].get('user_mentions', []):
                        add(from_user, from_id, u['screen_name'], None,
                            'reply')
                else:

                    if t.get('in_reply_to_status_id_str'):
                        to_id = t['in_reply_to_status_id_str']
                        to_user = t['in_reply_to_screen_name']
                        add(from_user, from_id, to_user, to_id, "reply")

                    if t.get('quoted_status'):
                        to_id = t['quoted_status']['id_str']
                        to_user = t['quoted_status']['user']['screen_name']
                        # NOTE(review): to_user_id is assigned but never used.
                        to_user_id = t['quoted_status']['user']['id_str']
                        add(from_user, from_id, to_user, to_id, "quote")
                    if retweets and t.get('retweeted_status'):
                        to_id = t['retweeted_status']['id_str']
                        to_user = t['retweeted_status']['user']['screen_name']
                        to_user_id = t['retweeted_status']['user']['id_str']
                        add(from_user, from_id, to_user, to_id, "retweet")
    # Pass 2: optionally drop connected components outside the requested
    # size window, by deleting their nodes from a working copy.
    # NOTE(review): networkx.connected_component_subgraphs was removed in
    # networkx 2.4 — this assumes an older networkx; TODO confirm the pin.
    if min_subgraph_size or max_subgraph_size:
        g_copy = G.copy()
        for g in networkx.connected_component_subgraphs(G):
            if min_subgraph_size and len(g) < min_subgraph_size:
                g_copy.remove_nodes_from(g.nodes())
            elif max_subgraph_size and len(g) > max_subgraph_size:
                g_copy.remove_nodes_from(g.nodes())
        G = g_copy

    # Pass 3: write the graph in the requested format and record the export.
    if output == 'gexf':
        networkx.write_gexf(
            G, os.path.join(EXPORTS_BASEDIR,
                            'gexf_{}.gexf'.format(export_uuid)))
        addExportRef = models.EXPORTS(url='gexf_{}.gexf'.format(export_uuid),
                                      type='Network(gexf)',
                                      exported=datetime.now(),
                                      count=count)
        q.exports.append(addExportRef)
        db.session.commit()
        db.session.close()

    elif output == ("json"):
        json.dump(to_json(G),
                  open(
                      os.path.join(EXPORTS_BASEDIR,
                                   'json_{}.json'.format(export_uuid)), "w"),
                  indent=2)
        addExportRef = models.EXPORTS(url='json_{}.json'.format(export_uuid),
                                      type='Network(json)',
                                      exported=datetime.now(),
                                      count=count)
        q.exports.append(addExportRef)
        db.session.commit()
        db.session.close()

    elif output == 'html':
        # NOTE(review): the EXPORTS row is committed before the HTML file is
        # written below, so a render failure leaves a dangling DB reference.
        addExportRef = models.EXPORTS(url='html_{}.html'.format(export_uuid),
                                      type='Network(html)',
                                      exported=datetime.now(),
                                      count=count)
        q.exports.append(addExportRef)
        db.session.commit()
        db.session.close()
        graph_data = json.dumps(to_json(G), indent=2)
        # Self-contained d3.js force-layout page; the graph JSON is spliced
        # in via the `%s` placeholder at `var graph = %s;`.
        html = """<!DOCTYPE html>
        <meta charset="utf-8">
        <script src="https://platform.twitter.com/widgets.js"></script>
        <script src="https://d3js.org/d3.v4.min.js"></script>
        <script src="https://code.jquery.com/jquery-3.1.1.min.js"></script>
        <style>

        .links line {
          stroke: #999;
          stroke-opacity: 0.8;
          stroke-width: 2px;
        }

        line.reply {
          stroke: #999;
        }

        line.retweet {
          stroke-dasharray: 5;
        }

        line.quote {
          stroke-dasharray: 5;
        }

        .nodes circle {
          stroke: red;
          fill: red;
          stroke-width: 1.5px;
        }

        circle.retweet {
          fill: white;
          stroke: #999;
        }

        circle.reply {
          fill: #999;
          stroke: #999;
        }

        circle.quote {
          fill: yellow;
          stroke: yellow;
        }

        #graph {
          width: 99vw;
          height: 99vh;
        }

        #tweet {
          position: absolute;
          left: 100px;
          top: 150px;
        }

        </style>
        <svg id="graph"></svg>
        <div id="tweet"></div>
        <script>

        var width = $(window).width();
        var height = $(window).height();

        var svg = d3.select("svg")
            .attr("height", height)
            .attr("width", width);

        var color = d3.scaleOrdinal(d3.schemeCategory20c);

        var simulation = d3.forceSimulation()
            .velocityDecay(0.6)
            .force("link", d3.forceLink().id(function(d) { return d.id; }))
            .force("charge", d3.forceManyBody())
            .force("center", d3.forceCenter(width / 2, height / 2));

        var graph = %s;

        var link = svg.append("g")
            .attr("class", "links")
          .selectAll("line")
          .data(graph.links)
          .enter().append("line")
            .attr("class", function(d) { return d.type; });

        var node = svg.append("g")
            .attr("class", "nodes")
          .selectAll("circle")
          .data(graph.nodes)
          .enter().append("circle")
            .attr("r", 5)
            .attr("class", function(d) { return d.type; })
            .call(d3.drag()
                .on("start", dragstarted)
                .on("drag", dragged)
                .on("end", dragended));

        node.append("title")
            .text(function(d) { return d.id; });

        node.on("click", function(d) {
          $("#tweet").empty();

          var rect = this.getBoundingClientRect();
          var paneHeight = d.type == "retweet" ? 50 : 200;
          var paneWidth = d.type == "retweet" ? 75 : 500;

          var left = rect.x - paneWidth / 2;
          if (rect.y > height / 2) {
            var top = rect.y - paneHeight;
          } else {
            var top = rect.y + 10;
          }

          var tweet = $("#tweet");
          tweet.css({left: left, top: top});

          if (d.type == "retweet") {
            twttr.widgets.createFollowButton(d.screen_name, tweet[0], {size: "large"});
          } else {
            twttr.widgets.createTweet(d.id, tweet[0], {conversation: "none"});
          }

          d3.event.stopPropagation();

        });

        svg.on("click", function(d) {
          $("#tweet").empty();
        });

        simulation
            .nodes(graph.nodes)
            .on("tick", ticked);

        simulation.force("link")
            .links(graph.links);

        function ticked() {
          link
              .attr("x1", function(d) { return d.source.x; })
              .attr("y1", function(d) { return d.source.y; })
              .attr("x2", function(d) { return d.target.x; })
              .attr("y2", function(d) { return d.target.y; });

          node
              .attr("cx", function(d) { return d.x; })
              .attr("cy", function(d) { return d.y; });
        }

        function dragstarted(d) {
          if (!d3.event.active) simulation.alphaTarget(0.3).restart();
          d.fx = d.x;
          d.fy = d.y;
        }

        function dragged(d) {
          d.fx = d3.event.x;
          d.fy = d3.event.y;
        }

        function dragended(d) {
          if (!d3.event.active) simulation.alphaTarget(0);
          d.fx = null;
          d.fy = null;
        }

        </script>
        """ % graph_data
        # NOTE(review): file handle is never closed explicitly (no `with`);
        # relies on CPython refcounting to flush — worth tightening.
        open(os.path.join(EXPORTS_BASEDIR, 'html_{}.html'.format(export_uuid)),
             "w").write(html)