Example #1
def insert_links(date, version, con):
    # Insert links into the DB
    with con:
        links_cur = con.cursor()
        links_gen = links.get_links(date, version)
        for link in links_gen:
            links_cur.execute(
                "INSERT INTO links VALUES(%s, %s, %s, %s, %s, %s, %s)", link)
Example #2
File: utils.py Project: 2dotstwice/dlcli
def backup_account(url='', org='', key='', account='', backupdir='', **kwargs):
    #  create directory structure
    backup_dir = create_dir(os.getcwd(), backupdir)
    org_dir = create_dir(backup_dir, org)
    account_dir = create_dir(org_dir, account)

    # backup agents
    agent_dir = create_dir(account_dir, 'agents')
    for agent_json in agents.get_agents(url=url, org=org, account=account, key=key):
        agent_path = os.path.join(agent_dir, str(agent_json['name']) + '.json')
        remove_keys = ['presence_state', 'created', 'modified', 'heartbeat']
        for k in remove_keys:
            if k in agent_json:
                del agent_json[k]
        with open(agent_path, 'w') as f:
            f.write(json.dumps(agent_json, indent=4))

    # backup dashboards
    dashboard_dir = create_dir(account_dir, 'dashboards')
    for d in dashboards.get_dashboards(url=url, org=org, account=account, key=key):
        dashboard_path = os.path.join(dashboard_dir, str(d['name']) + '.yaml')
        with open(dashboard_path, 'w') as f:
            f.write(yaml.safe_dump(d, default_flow_style=False, explicit_start=True))

    # backup plugins
    plugin_dir = create_dir(account_dir, 'plugins')
    for p in plugins.get_plugins(url=url, org=org, account=account, key=key):
        plugin_path = os.path.join(plugin_dir, str(p['name']) + '.' + str(p['extension']))
        with open(plugin_path, 'w') as f:
            f.write(plugins.export_plugin(plugin=p['name'], url=url, org=org, account=account, key=key))


    # backup rules
    rule_dir = create_dir(account_dir, 'rules')
    for r in rules.get_rules(url=url, org=org, account=account, key=key):
        rule_path = os.path.join(rule_dir, str(r['name']) + '.yaml')
        with open(rule_path, 'w') as f:
            rule_content = yaml.safe_load(rules.export_rule(rule=r['id'], url=url, org=org, account=account, key=key))
            if rule_content['actions']:
                action_count = len(rule_content['actions'])
                for i in range(action_count):
                    try:
                        del rule_content['actions'][i]['details']['status']
                    except KeyError:
                        continue
            f.write(yaml.safe_dump(rule_content, default_flow_style=False, explicit_start=True))

    # backup links
    link_dir = create_dir(account_dir, 'links')
    for link in links.get_links(url=url, org=org, account=account, key=key):
        link_path = os.path.join(link_dir, link['id'] + '.json')
        link_json = links.export_link(link_id=link['id'], url=url, org=org, account=account, key=key)
        with open(link_path, 'w') as f:
            f.write(json.dumps(link_json, indent=4))
Example #3
 def test_get_links_code(self):
     text = read_file("code.js")
     self.assertEqual(
         [
             "https://stackoverflow.com/a/57804949",
             "http://www.google.com",
             "http://www.mylink.com",
             "http://www.yourlink.com",
             "http://www.test.com",
             "http://www.facebook.com",
         ],
         get_links(text),
     )
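The test above (and the shorter one in Example #6) only exercises get_links() from the outside; the implementation under test is not shown. As a rough, hypothetical illustration, a regex-based version might look like this:

import re

def get_links(text):
    # Return http/https URLs in their order of appearance in the text.
    # This is only a sketch; the tested project may extract links differently.
    return re.findall(r'''https?://[^\s<>"']+''', text)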
Example #4
from bs4 import BeautifulSoup
import urllib3
import links      # project-local module; assumed to provide get_links()
import functions  # project-local module; assumed to provide get_data()

urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

## Get list of canceled/renewed TV shows
http = urllib3.PoolManager()

url = 'https://www.metacritic.com/feature/tv-renewal-scorecard-2017-2018-season'

fp = http.request('GET', url)
soup = BeautifulSoup(fp.data, features='lxml')
data = soup.find_all('p', {'class': 'medium'})

func = functions.get_data()
links = links.get_links()

#links.get_IMDb_links('$100,000 Pyramid')

# Parse the webpage to create a list of canceled/renewed/rescued TV shows
for text in data:
    if "has renewed" in text.text or "renewed for" in text.text:
        func.is_renewed(text)
    elif "canceled" in text.text:
        func.is_canceled(text)
    if "rescued" in text.text:
        func.is_rescued(text)

# If show was rescued, remove from canceled list if there
for i in func.removeFromCanceled:
    func.canceled = [
Example #5
import links
import DB_connect
from datetime import date, timedelta
from json import JSONDecoder

Days = 1

while (Days < 2920):  # 365*8 = 2920 as the website got news only till 2006

    url = "http://www.bloomberg.com/archive/news/"
    d = date.today() - timedelta(
        days=Days
    )  # We start extracting data from a date before today due to time zone
    d = d.strftime('%Y-%m-%d')
    url = url + d + "/"
    list_links = links.get_links(
        url)  # It gets Articles published on any single day

    total_links = len(list_links)  # Total Articles published on a single day
    print "Total articles for date: "
    print d, " are: ", total_links

    # Fetch the article details by parsing each article page, then return the data in JSON format to be stored in the DB

    for index in range(0, total_links):
        print list_links[index]
        data = links.get_articles(list_links[index])
        data = JSONDecoder().decode(data)

        db = DB_connect.mongo_insert(data)  # Data inserted in DB

    # After fetching all articles of a particular day, increase Days by one so the date shifts to the previous day.
    Days += 1
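DB_connect is a project-local module and is not shown above; a hypothetical mongo_insert() helper built on pymongo might look roughly like this (database and collection names are invented for illustration):

# DB_connect.py -- hypothetical sketch, not the project's actual module
from pymongo import MongoClient

_client = MongoClient('localhost', 27017)
_articles = _client['bloomberg']['articles']  # assumed database/collection names

def mongo_insert(data):
    # Insert one decoded article document and return its generated _id.
    return _articles.insert_one(data).inserted_id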
Example #6
 def test_get_links(self):
     text = read_file("text.txt")
     self.assertEqual(
         ["http://www.example.org/bag.aspx", "http://example.com/bat.html"],
         get_links(text),
     )
Example #7
import links
import DB_connect
from datetime import date, timedelta
from json import JSONDecoder



Days=1

while(Days < 2920):								# 365*8 = 2920 as the website got news only till 2006

	url="http://www.bloomberg.com/archive/news/"										
	d=date.today()-timedelta(days=Days)			# We start extracting data from a date before today due to time zone 
	d=d.strftime('%Y-%m-%d')
	url=url+d+"/"
	list_links = links.get_links(url)			# It gets Articles published on any single day
	
	total_links = len(list_links)				# Total Articles published on a single day
	print "Total articles for date: " 
	print d, " are: ", total_links

	
	# Fetch the article details by parsing each article page, then return the data in JSON format to be stored in the DB
	
	for index in range(0,total_links):	
		print list_links[index]
		data = links.get_articles(list_links[index])
		data = JSONDecoder().decode(data)
		
		db=DB_connect.mongo_insert(data) # Data inserted in DB
Example #8
def backup_account(url='', org='', key='', account='', backup_dir='', **kwargs):
    #  create directory structure
    backup_dir = create_dir(os.getcwd(), backup_dir)
    org_dir = create_dir(backup_dir, org)
    account_dir = create_dir(org_dir, account)

    # backup agents
    agent_dir = create_dir(account_dir, 'agents')
    for agent in agents.get_agents(url=url, org=org, account=account, key=key):
        logging.debug('Exporting JSON for agent "%s"', agent['name'])
        # some agents can have a name like 'http://...'; URL-encode the name before using it as a filename
        agent_path = os.path.join(agent_dir, str(urllib.quote(agent['name'], safe='')) + '.json')
        remove_keys = ['presence_state', 'created', 'modified', 'heartbeat']
        for k in remove_keys:
            if k in agent:
                del agent[k]
        with open(agent_path, 'w') as f:
            f.write(json.dumps(agent, indent=4))

    # backup dashboards
    dashboard_dir = create_dir(account_dir, 'dashboards')
    for dash in dashboards.get_dashboards(url=url, org=org, account=account, key=key):
        logging.debug('Exporting YAML for dashboard "%s"', dash['name'])
        dashboard_path = os.path.join(dashboard_dir, str(dash['name']) + '.yaml')
        with open(dashboard_path, 'w') as f:
            f.write(yaml.safe_dump(dash, default_flow_style=False, explicit_start=True))

    # backup plugins
    plugin_dir = create_dir(account_dir, 'plugins')
    for plugin in plugins.get_plugins(url=url, org=org, account=account, key=key):
        logging.debug('Exporting plugin "%s"', plugin['name'])
        plugin_path = os.path.join(plugin_dir, str(plugin['name']) + '.' + str(plugin['extension']))
        with open(plugin_path, 'w') as f:
            f.write(plugins.export_plugin(plugin=plugin['name'], url=url, org=org, account=account, key=key))


    # backup rules
    rule_dir = create_dir(account_dir, 'rules')
    for rule in rules.get_rules(url=url, org=org, account=account, key=key):
        logging.debug('Exporting YAML for rule "%s" with id %s', rule['name'], rule['id'])
        rule_path = os.path.join(rule_dir, str(rule['name']) + '.yaml')
        with open(rule_path, 'w') as f:
            rule_yaml = rules.export_rule(rule=rule['id'], url=url, org=org, account=account, key=key)
            try:
                rule_content = yaml.safe_load(rule_yaml)
                if rule_content['actions']:
                    action_count = len(rule_content['actions'])
                    for i in range(action_count):
                        try:
                            del rule_content['actions'][i]['details']['status']
                        except KeyError:
                            continue
                f.write(yaml.safe_dump(rule_content, default_flow_style=False, explicit_start=True))
            except yaml.YAMLError as e:
                logging.warn('Unable to parse YAML for rule %s: %s', rule['name'], e.problem)
                f.write(rule_yaml)

    # backup links
    link_dir = create_dir(account_dir, 'links')
    for link in links.get_links(url=url, org=org, account=account, key=key):
        logging.debug('Exporting JSON for pack "%s" with id %s', link['plugin'], link['id'])
        link_path = os.path.join(link_dir, link['id'] + '.json')
        link_json = links.export_link(link_id=link['id'], url=url, org=org, account=account, key=key)
        with open(link_path, 'w') as f:
            f.write(json.dumps(link_json, indent=4))
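A hypothetical invocation of backup_account() as defined above; the URL, org, account and key values are placeholders, and the create_dir, agents, dashboards, plugins, rules and links helpers are assumed to come from the same package:

backup_account(url='https://api.example.com',
               org='acme',
               account='production',
               key='PLACEHOLDER-API-KEY',
               backup_dir='backups')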
Example #9
 def generate(self):
     self.out_text.delete(1.0, END)
     self.out_text.insert(
         END, "\n".join(get_links(self.input_text.get(1.0, END))))
Example #10
# canonical names that don't
#  have a row in locations.tsv
#  have a label in latin letters
#  have a label in the tengwar
#  have a region in regions.svg

from locations import locations as get_locations, CANONICAL, SINDARIN, ROMAN, OTHER

locations = get_locations()
from regions import regions2 as get_regions

regions = get_regions()
from links import links as get_links

links = get_links()
from names import names as get_names

names = get_names()
from labels import abnormal_labels as get_abnormal_labels, normalized_labels as get_normalized_labels

normalized_labels = [label for name, label in get_normalized_labels()]
abnormal_labels = [label for name, label in get_abnormal_labels()]
postpone_canonical_names = set(name.strip() for name in open("postpone.txt"))


def canonical_names():
    canonical_names = set()

    source_pairs = (
        # ('location', set(zip(*locations)[0])),
Example #11
from bs4 import BeautifulSoup as bs
import requests
from links import get_links

filename = 'timesofindia_tweets.csv'
targets = get_links(filename)
'''
format for data:
{
'source' : 'times_of_india',
'title' : "",
'paragraphs':[""],
'publication':"<timestamp>",
'category':"",
'tags':""
}
'''
for target in targets:
    row = {
        'source': 'times_of_india',
        'title': "",
        'paragraphs': [""],
        'publication': "",
        'category': "",
        'tags': ""
    }
    row['publication'] = target['datetime']
    req = requests.get(target['link'])
    soup = bs(req.text, 'html.parser')
    heading = soup.find_all('h1')
    for i in heading: