def __init__(self, *args, **kwargs):
    session = cache.get_session(timeout=1.5)
    session.params['api_key'] = libraries_io_api_key
    super(self.__class__, self).__init__(
        'https://libraries.io/api', session=session, *args, **kwargs
    )
def __init__(self, *args, **kwargs):
    super(self.__class__, self).__init__(
        'https://api.github.com',
        auth=(gh_username, gh_password),
        session=cache.get_session(timeout=1.0),
        *args, **kwargs
    )
def get_records(query):
    """ Fetch records for a Craigslist search for a query """

    # Get the latest page in our neighborhood
    resp = cache.get_session().get("http://sfbay.craigslist.org/search/eby/sss", params={
        'query': urllib.quote_plus(query),
    })
    soup = Soup(resp.text, "html.parser")

    # Go through each of the search results
    records = []
    for row in soup.select('li.result-row'):

        # Get the date of the post
        timestamp = row.find('time')['datetime']
        date_time = datetime.fromtimestamp(mktime(
            strptime(timestamp, "%Y-%m-%d %H:%M")
        ))

        # ... link ...
        link = row.find('a', attrs={'class': 'hdrlnk'})
        href = link['href']
        desc = link.text

        # ... item price ...
        prices = row.select('.result-price')
        price = "?"
        if prices:
            price = prices[0].text.replace('$', '')

        # ... where the item's at ...
        neighborhoods = row.select('.result-hood')
        neighborhood = "?"
        if neighborhoods:
            neighborhood = re.search(r'\((.*)\)', neighborhoods[0].text).group(1)

        # Only save the record if it's from the last N days
        if abs((date_time - datetime.now()).days) <= 2:
            records.append([
                '%d/%d' % (date_time.month, date_time.day),
                desc[:50] + "...",
                price,
                neighborhood,
                href,
            ])

    return records
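# Hypothetical usage sketch (not part of the original function): get_records() returns a
# list of [date, description, price, neighborhood, link] rows, so a caller could dump them
# as tab-separated text. The "bike" query below is only an illustrative placeholder.
if __name__ == '__main__':
    for record in get_records("bike"):
        print('\t'.join(record))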
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import ConfigParser
from requests.auth import AuthBase
import slumber
import os.path

import cache


logging.basicConfig(level=logging.INFO, format="%(message)s")

default_requests_session = cache.get_session(timeout=1)
default_requests_session.headers['User-Agent'] =\
    "Andrew Head (for academic analysis) <[email protected]>, Austin Le (for academic" +\
    " analysis) <*****@*****.**>"

lib_config = ConfigParser.ConfigParser()
lib_config.read(os.path.expanduser(os.path.join('~', '.libraries_config')))
libraries_io_api_key = lib_config.get('api', 'API_KEY')


class LibrariesIoAuth(AuthBase):

    def __init__(self, api_key):
        self.api_key = api_key

    def __call__(self, request):
        # Attach the API key as a query parameter. A PreparedRequest has no
        # 'data' attribute, so re-prepare the URL with the extra parameter.
        request.prepare_url(request.url, {'api_key': self.api_key})
        return request
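# Hypothetical usage sketch (not in the original file): LibrariesIoAuth follows the
# standard requests AuthBase pattern, so it could be passed as auth= on a request
# instead of setting session.params. The '/search' endpoint and query value below
# are illustrative assumptions.
if __name__ == '__main__':
    resp = default_requests_session.get(
        'https://libraries.io/api/search',
        params={'q': 'requests'},
        auth=LibrariesIoAuth(libraries_io_api_key),
    )
    logging.info("Got %d search results", len(resp.json()))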
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import argparse

import cache
import peewee
from models import Bigram, Trigram, create_tables


logging.basicConfig(level=logging.INFO, format="%(message)s")
session = cache.get_session()

TARGET_TAGS = ['wget', 'regex', 'css-selectors', 'jquery']


def make_url(tags):
    tag_string = ';'.join(tags)
    return 'https://api.stackexchange.com/2.2/tags/{tags}/related'.format(tags=tag_string)


def fetch_bigrams():
    for tag in TARGET_TAGS:
        resp = session.get(make_url([tag]), params={
            'pagesize': 100,
            'site': 'stackoverflow',
        })
        respJson = resp.json()
        for i in respJson['items']:
            bg, _ = Bigram.get_or_create(tag1=tag, tag2=i['name'])
            bg.count = i['count']
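# Hypothetical driver sketch (not shown in the fragment above): the module imports
# create_tables from models, so a plausible entry point would create the tables and
# then populate the Bigram counts. The argparse wiring here is an assumption.
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description="Fetch related-tag counts from Stack Exchange")
    parser.parse_args()
    create_tables()
    fetch_bigrams()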
            for tag in soup.children:
                if tag.name == 'pre':
                    rfile.write(tag.text + "\n")
                    rfile.write("--------------\n")


def question_lines_to_file(results):
    with codecs.open(QUESTION_LINES, 'w', encoding='utf8') as lfile:
        for r in results:
            soup = Soup(r['body'])
            for tag in soup.children:
                if tag.name == 'pre':
                    # remove line continuations (a backslash at the end of a line)
                    text = re.sub(r'\\\s*\n', '', tag.text)
                    lines = text.split('\n')
                    for line in lines:
                        if re.match(r"^(.*\$)?\s*wget", line):
                            line = re.sub(r"^.*\$\s*", "", line)
                            lfile.write(line.strip() + '\n')


if __name__ == '__main__':
    session = cache.get_session(timeout=1.0)
    questions = fetch_questions(session, 'wget')
    answers = fetch_answers(session, questions)
    answer_results_to_file(answers)
    answer_lines_to_file(answers)
    question_results_to_file(questions)
    question_lines_to_file(questions)
import logging
import tokenize
import re
from StringIO import StringIO
import argparse

from progressbar import ProgressBar, Percentage, Bar, RotatingMarker, ETA, Counter

import cache
import models
from models import Page, Snippet, Token, Comment, SnippetComment, SnippetToken
from sites import SITES


logging.basicConfig(level=logging.INFO, format="%(message)s")

HEADER_TAGS = ['h1', 'h2', 'h3', 'h4', 'h5', 'h6']
TEXT_TAGS = ['p', 'div']
NONTEXT_TAGS = ['script']

requests_session = cache.get_session()


def extract_code(node):
    # Strip doctest-style interpreter prompts (">>> " and "... ") from the snippet text
    code_text = node.text
    code_text = re.sub('^>>> ', '', code_text, flags=re.MULTILINE)
    code_text = re.sub(r'^\.\.\. ', '', code_text, flags=re.MULTILINE)
    return code_text


def is_text(text):
    # The heuristic we use here is that we only consider a string to be a textual description
    # if it contains at least two consecutive alphabetic letters
    if not re.search('[A-Za-z]{2}', text):
        return False
    return True
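# Hypothetical usage sketch: extract_code() strips interpreter prompts from a <pre> node
# and is_text() filters out strings with no real prose. BeautifulSoup is assumed here
# because the helpers operate on parsed HTML nodes; it is not imported above.
if __name__ == '__main__':
    from bs4 import BeautifulSoup
    demo = BeautifulSoup("<pre>>>> print('hi')\n... done</pre>", "html.parser")
    print(extract_code(demo.find('pre')))   # "print('hi')\ndone"
    print(is_text("Fetch the page"))        # True
    print(is_text(">>> 1 + 1"))             # False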
#! /usr/bin/env python
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
import logging
import ConfigParser
import slumber
import os.path

import cache


logging.basicConfig(level=logging.INFO, format="%(message)s")

default_requests_session = cache.get_session(timeout=1)
default_requests_session.headers['User-Agent'] =\
    "Austin Le (for academic analysis) <*****@*****.**>"

gh_config = ConfigParser.ConfigParser()
gh_config.read(os.path.expanduser(os.path.join('~', '.github', 'github.cfg')))
gh_username = gh_config.get('auth', 'username')
gh_password = gh_config.get('auth', 'password')


class Github(slumber.API):

    def __init__(self, *args, **kwargs):
        super(self.__class__, self).__init__(
            'https://api.github.com',
            auth=(gh_username, gh_password),
            session=cache.get_session(timeout=1.0),
            *args, **kwargs
        )
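# Hypothetical usage sketch (not in the original file): slumber maps attribute access
# and calls onto URL path segments, so Github() can address REST resources directly.
# The 'octocat' user below is only an illustrative target.
if __name__ == '__main__':
    gh = Github()
    user = gh.users('octocat').get()   # GET https://api.github.com/users/octocat
    logging.info("octocat has %d public repos", user['public_repos'])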