Exemplo n.º 1
0
### Set publication-specific variables
# Flag convention used below: None means "no", any other value means "yes".

# Short name for the publication
pubshort = "cpd"

# Do we need to get the list of most viewed items from another page?
pubmv_external = None

# Regex source for deciding what is and what is not a news item.
# Written as a raw string so that "\d" is handed to the regex engine untouched
# rather than being parsed as an (invalid) string escape, which newer Python
# versions warn about. The resulting string value is unchanged.
# NOTE(review): the "." in front of "html" is unescaped, so it matches any
# character; left as-is to keep matching behaviour identical -- confirm intent.
pattern = r"/(\d+)/(\d+)/(.+).html?|/news/article/"

# Timezone the publication is in
pub_tz = "US/Eastern"

# Do we want to move files on success?
move_on_success = None

# Directory for storing successful files (as subdirectory of data directory)
success_dir = "success/"

########### Load libraries
import parserfunctions
import re
from bs4 import BeautifulSoup

### Grab the information from our configuration file
# load_config() and homepages_dir() are project helpers in parserfunctions;
# what they return is not visible from this script.
config = parserfunctions.load_config()
# presumably the directory holding this publication's saved homepages -- TODO confirm
homepages_dir = parserfunctions.homepages_dir(pubshort)
# Compile the news-item pattern once, up front.
link_pattern = re.compile(pattern)

### Establish our MySQL Connection (for logging, etc.)
# The helper hands back four values: a connection, a cursor and two names
# (presumably the data table and the log table -- verify in parserfunctions).
conn, cur, mysql_table_name, mysql_log_name = parserfunctions.create_mysql_conn(
    config)

### Create directory for success, if appropriate
# move_on_success is passed through so the helper can decide whether anything
# needs creating (None means "do not move files", per the flag's own comment).
parserfunctions.create_success_dir(pubshort, homepages_dir, move_on_success)

### Get list of files to parse
# Two values come back: the list and, presumably, its length -- TODO confirm.
file_list, file_list_len = parserfunctions.get_file_list(
    pubshort, homepages_dir)
# NOTE(review): looks like a 1-based counter for the loop that follows -- confirm.
i = 1

### For each desktop homepage
Exemplo n.º 2
0
# Flag convention used below: None means "no", any other value means "yes".

# Do we need to get the list of most viewed items from another page?
pubmv_external = None

# Link pattern for actual articles.
# Written as a raw string so that "\d" and "\." are handed to the regex engine
# untouched rather than being parsed as (invalid) string escapes, which newer
# Python versions warn about. The resulting string value is unchanged.
# NOTE(review): the "." in front of "html" is unescaped, so it matches any
# character; left as-is to keep matching behaviour identical -- confirm intent.
pattern = r"/(.+)/(\d+)_(.+).html?|/(.+)/(\d+)/(\d+)/|projects\.(.+\.com)/(\d+)/|video\.(.+\.com)/(\d+)|/(.+)/photogalleries/(.+)(\d+)|/gamecenter/"

# Timezone the publication is in
pub_tz = "US/Pacific"

# Do we want to process the desktop pages?
process_desktop = 1

# Do we want to move files on success?
move_on_success = None

# Directory for storing successful files (as subdirectory of data directory)
success_dir = "success/"

########### Load libraries
import parserfunctions
import re
from bs4 import BeautifulSoup

### Grab the information from our configuration file
# load_config() and homepages_dir() are project helpers in parserfunctions;
# what they return is not visible from this script.
config = parserfunctions.load_config()
# presumably the directory holding this publication's saved homepages -- TODO confirm
homepages_dir = parserfunctions.homepages_dir(pubshort)
# Compile the article link pattern once, up front.
link_pattern = re.compile(pattern)

### Establish our MySQL Connection (for logging, etc.)
# The helper hands back four values: a connection, a cursor and two names
# (presumably the data table and the log table -- verify in parserfunctions).
conn, cur, mysql_table_name, mysql_log_name = parserfunctions.create_mysql_conn(config)

### Create directory for success, if appropriate
# move_on_success is passed through so the helper can decide whether anything
# needs creating (None means "do not move files", per the flag's own comment).
parserfunctions.create_success_dir(pubshort, homepages_dir, move_on_success)

########### Parse Desktop Pages
if process_desktop is not None:
    ### Get list of files to parse
    file_list, file_list_len = parserfunctions.get_file_list(pubshort, homepages_dir)
    i = 1
    
    ### For each desktop homepage