Example #1
    def __init__(self, *args, **kwargs):
        # Send the Libraries.io API key as a default query parameter on every request
        session = cache.get_session(timeout=1.5)
        session.params['api_key'] = libraries_io_api_key
        super(self.__class__, self).__init__('https://libraries.io/api',
                                             session=session,
                                             *args,
                                             **kwargs)
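
All of the examples on this page import a project-local cache module whose source is not shown here. A minimal sketch of what get_session(timeout=...) might look like, assuming it wraps requests_cache and treats timeout as the default per-request socket timeout (the real module may implement it differently, for instance as a throttling delay between requests):

# Hypothetical sketch of the shared `cache` module; not the actual implementation
import requests_cache


class _TimeoutSession(requests_cache.CachedSession):
    """A cached session that applies a default timeout to every request."""

    def __init__(self, timeout=None, **kwargs):
        super(_TimeoutSession, self).__init__(**kwargs)
        self.default_timeout = timeout

    def request(self, method, url, **kwargs):
        # Assumed semantics: `timeout` is a per-request socket timeout
        kwargs.setdefault('timeout', self.default_timeout)
        return super(_TimeoutSession, self).request(method, url, **kwargs)


def get_session(timeout=None):
    # Responses are cached on disk, so repeated fetches during analysis are cheap
    return _TimeoutSession(timeout=timeout, cache_name='fetch_cache')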
Example #2
    def __init__(self, *args, **kwargs):
        # Authenticate to the GitHub API over a cached session
        super(self.__class__,
              self).__init__('https://api.github.com',
                             auth=(gh_username, gh_password),
                             session=cache.get_session(timeout=1.0),
                             *args,
                             **kwargs)
Example #3
    def __init__(self, *args, **kwargs):
        super(self.__class__, self).__init__(
            'https://api.github.com',
            auth=(gh_username, gh_password),
            session=cache.get_session(timeout=1.0),
            *args, **kwargs
        )
Example #4
    def __init__(self, *args, **kwargs):
        session = cache.get_session(timeout=1.5)
        session.params['api_key'] = libraries_io_api_key
        super(self.__class__, self).__init__(
            'https://libraries.io/api',
            session=session,
            *args, **kwargs
        )
Example #5
def get_records(query):
    """
    Fetch records for a Craigslist search for a query
    """
    # Get the latest page in our neighborhood
    resp = cache.get_session().get("http://sfbay.craigslist.org/search/eby/sss", params={
        'query': query,  # requests URL-encodes query parameters itself
    })
    soup = Soup(resp.text, "html.parser")

    # Go through each of the search results
    records = []
    for row in soup.select('li.result-row'):

        # Get the date of the post
        timestamp = row.find('time')['datetime']
        date_time = datetime.strptime(timestamp, "%Y-%m-%d %H:%M")

        # ... link ....
        link = row.find('a', attrs={'class': 'hdrlnk'})
        href = link['href']
        desc = link.text

        # ... item price ...
        prices = row.select('.result-price')
        price = "?"
        if prices:
            price = prices[0].text.replace('$', '')

        # ... where the item's at ...
        neighborhoods = row.select('.result-hood')
        neighborhood = "?"
        if neighborhoods:
            neighborhood = re.search(r'\((.*)\)', neighborhoods[0].text).group(1)

        # Only save the record if it was posted within the last two days
        if abs((date_time - datetime.now()).days) <= 2:
            records.append([
                '%d/%d' % (date_time.month, date_time.day), desc[:50] + "...",
                price, neighborhood, href,
            ])

    return records
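
The function returns each record as a list of plain strings, so a call site (not part of this excerpt; the query string is only an illustration) could simply dump the rows as tab-separated text:

# Hypothetical call site; each record is [date, description, price, neighborhood, link]
for record in get_records("standing desk"):
    print('\t'.join(record))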
Example #6
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import ConfigParser
from requests.auth import AuthBase
import slumber
import os.path
import cache

logging.basicConfig(level=logging.INFO, format="%(message)s")
default_requests_session = cache.get_session(timeout=1)
default_requests_session.headers['User-Agent'] = \
    "Andrew Head (for academic analysis) <[email protected]>, " \
    "Austin Le (for academic analysis) <*****@*****.**>"

lib_config = ConfigParser.ConfigParser()
lib_config.read(os.path.expanduser(os.path.join('~', '.libraries_config')))
libraries_io_api_key = lib_config.get('api', 'API_KEY')


class LibrariesIoAuth(AuthBase):
    def __init__(self, api_key):
        self.api_key = api_key

    def __call__(self, request):
        # `request` is a PreparedRequest; merge the API key into its query string
        request.prepare_url(request.url, {'api_key': self.api_key})
        return request
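
The call site for LibrariesIoAuth is not part of this excerpt. requests invokes the auth object on every prepared request, so attaching it would presumably look something like the following (the /api/platforms endpoint is used purely for illustration):

# Hypothetical usage; the snippet above only defines the auth class
session = cache.get_session(timeout=1.5)
resp = session.get('https://libraries.io/api/platforms',
                   auth=LibrariesIoAuth(libraries_io_api_key))
platforms = resp.json()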

Example #7
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import argparse
import cache
import peewee
from models import Bigram, Trigram, create_tables


logging.basicConfig(level=logging.INFO, format="%(message)s")
session = cache.get_session()
TARGET_TAGS = ['wget', 'regex', 'css-selectors', 'jquery']


def make_url(tags):
    tag_string = ';'.join(tags)
    return 'https://api.stackexchange.com/2.2/tags/{tags}/related'.format(tags=tag_string)


def fetch_bigrams():
    for tag in TARGET_TAGS:
        resp = session.get(make_url([tag]), params={
            'pagesize': 100,
            'site': 'stackoverflow',
        })
        resp_json = resp.json()
        for item in resp_json['items']:
            # Record how often each related tag co-occurs with the target tag
            bg, _ = Bigram.get_or_create(tag1=tag, tag2=item['name'])
            bg.count = item['count']
            bg.save()
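
The Bigram model comes from the project's models module, which this page does not show. A minimal sketch of what it might look like in peewee, with the field names inferred from the usage above and the SQLite backend purely an assumption:

# Hypothetical models.py sketch; only the Bigram fields used above are included
import peewee

db = peewee.SqliteDatabase('tags.db')  # assumed backend


class Bigram(peewee.Model):
    tag1 = peewee.CharField()
    tag2 = peewee.CharField()
    count = peewee.IntegerField(default=0)

    class Meta:
        database = db


def create_tables():
    db.create_tables([Bigram], safe=True)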
Example #8
            for tag in soup.children:
                if tag.name == 'pre':
                    rfile.write(tag.text + "\n")
                    rfile.write("--------------\n")


def question_lines_to_file(results):

    with codecs.open(QUESTION_LINES, 'w', encoding='utf8') as lfile:
        for r in results:
            soup = Soup(r['body'])
            for tag in soup.children:
                if tag.name == 'pre':
                    # Remove shell line continuations (backslash + newline)
                    text = re.sub(r'\\\s*\n', '', tag.text)
                    lines = text.split('\n')
                    for line in lines:
                        # Keep only lines that invoke wget, stripping any shell prompt
                        if re.match(r"^(.*\$)?\s*wget", line):
                            line = re.sub(r"^.*\$\s*", "", line)
                            lfile.write(line.strip() + '\n')


if __name__ == '__main__':
    session = cache.get_session(timeout=1.0)
    questions = fetch_questions(session, 'wget')
    answers = fetch_answers(session, questions)
    answer_results_to_file(answers)
    answer_lines_to_file(answers)
    question_results_to_file(questions)
    question_lines_to_file(questions)
Example #9
            for tag in soup.children:
                if tag.name == 'pre':
                    rfile.write(tag.text + "\n")
                    rfile.write("--------------\n")


def question_lines_to_file(results):

    with codecs.open(QUESTION_LINES, 'w', encoding='utf8') as lfile:
        for r in results:
            soup = Soup(r['body'])
            for tag in soup.children:
                if tag.name == 'pre':
                    # Remove shell line continuations (backslash + newline)
                    text = re.sub(r'\\\s*\n', '', tag.text)
                    lines = text.split('\n')
                    for line in lines:
                        # Keep only lines that invoke wget, stripping any shell prompt
                        if re.match(r"^(.*\$)?\s*wget", line):
                            line = re.sub(r"^.*\$\s*", "", line)
                            lfile.write(line.strip() + '\n')


if __name__ == '__main__':
    session = cache.get_session(timeout=1.0)
    questions = fetch_questions(session, 'wget')
    answers = fetch_answers(session, questions)
    answer_results_to_file(answers)
    answer_lines_to_file(answers)
    question_results_to_file(questions)
    question_lines_to_file(questions)

Example #10
import logging
import tokenize
import re
from StringIO import StringIO
import argparse
from progressbar import ProgressBar, Percentage, Bar, RotatingMarker, ETA, Counter

import cache
import models
from models import Page, Snippet, Token, Comment, SnippetComment, SnippetToken
from sites import SITES


logging.basicConfig(level=logging.INFO, format="%(message)s")
HEADER_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
TEXT_TAGS = ['p', 'div']
NONTEXT_TAGS = ['script']
requests_session = cache.get_session()


def extract_code(node):
    # Strip interpreter prompts (">>> " and "... ") from a doctest-style code block
    code_text = node.text
    code_text = re.sub(r'^>>> ', '', code_text, flags=re.MULTILINE)
    code_text = re.sub(r'^\.\.\. ', '', code_text, flags=re.MULTILINE)
    return code_text


def is_text(text):
    # Heuristic: only consider a string a textual description if it contains
    # at least two consecutive alphabetic letters anywhere in it
    return bool(re.search('[A-Za-z]{2}', text))
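
As a quick illustration of extract_code (assuming Soup is bs4.BeautifulSoup, as in the earlier examples), interpreter prompts are stripped so a doctest-style block becomes plain source:

# Hypothetical input; `node` stands in for a scraped <pre> element
node = Soup('<pre>&gt;&gt;&gt; x = 1\n&gt;&gt;&gt; x + 1\n2</pre>', 'html.parser').pre
print(extract_code(node))
# x = 1
# x + 1
# 2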
Example #11
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import ConfigParser
import slumber
import os.path
import cache


logging.basicConfig(level=logging.INFO, format="%(message)s")
default_requests_session = cache.get_session(timeout=1)
default_requests_session.headers['User-Agent'] =\
    "Austin Le (for academic analysis) <*****@*****.**>"

gh_config = ConfigParser.ConfigParser()
gh_config.read(os.path.expanduser(os.path.join('~', '.github', 'github.cfg')))
gh_username = gh_config.get('auth', 'username')
gh_password = gh_config.get('auth', 'password')


class Github(slumber.API):
    def __init__(self, *args, **kwargs):
        super(self.__class__, self).__init__(
            'https://api.github.com',
            auth=(gh_username, gh_password),
            session=cache.get_session(timeout=1.0),
            *args, **kwargs
        )
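
Because Github subclasses slumber.API, attribute and call access map onto URL path segments. A hypothetical call site (the repository path is only an illustration) might look like:

# append_slash is forwarded to slumber.API through **kwargs
github = Github(append_slash=False)
# Issues a GET against the repos/octocat/Hello-World resource over the cached session
repo = github.repos('octocat', 'Hello-World').get()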