def get_proxies_list():
    """Rebuild the module-level ``proxies_list`` from ``proxy.fetch()``.

    Each fetched item becomes a mapping with ``'http'`` and ``'https'`` keys,
    both set to ``http://<ip>:<port>`` built from the item's ``'ip'`` and
    ``'port'`` entries.  Nothing is returned; the result is published through
    the ``proxies_list`` global.
    """
    global proxies_list
    rebuilt = []
    for entry in proxy.fetch():
        rebuilt.append({
            'http': 'http://%s:%s' % (entry.get('ip'), entry.get('port')),
            'https': 'http://%s:%s' % (entry.get('ip'), entry.get('port')),
        })
    # Publish only once the whole list is built, so a failure part-way
    # through leaves the previous global value untouched.
    proxies_list = rebuilt
def get_proxies_list():
    """Refresh the global ``proxies_list`` with one mapping per fetched proxy.

    The items come from ``proxy.fetch()``; every resulting mapping has the
    keys ``'http'`` and ``'https'``, each holding ``http://<ip>:<port>``
    formatted from the item's ``'ip'`` and ``'port'`` values.
    """
    global proxies_list

    def as_mapping(item):
        # Both schemes are routed through the same http:// proxy address.
        return {
            'http': 'http://%s:%s' % (item.get('ip'), item.get('port')),
            'https': 'http://%s:%s' % (item.get('ip'), item.get('port')),
        }

    fetched = proxy.fetch()
    proxies_list = [as_mapping(item) for item in fetched]
示例#3
0
def route_item(name, id):
    """Feed proxy handler for one item of a configured feed.

    When ``name`` is a key of ``settings.content.feeds``, the feed's value
    with ``/<id>`` appended is handed to ``proxy.fetch`` together with
    ``response`` and that call's result is returned.  Otherwise an
    ``HTTPError`` with status 404 is returned.
    """
    feeds = settings.content.feeds
    if name not in feeds:
        return HTTPError(404, "File not found")
    item_url = feeds[name] + '/%s' % id
    return proxy.fetch(item_url, response)
示例#4
0
def route_feeds(name):
    """Feed proxy handler for a whole configured feed.

    Returns the result of ``proxy.fetch`` called with the entry stored under
    ``name`` in ``settings.content.feeds`` and with ``response``; returns an
    ``HTTPError`` with status 404 when ``name`` is not configured.
    """
    feeds = settings.content.feeds
    if name not in feeds:
        return HTTPError(404, "File not found")
    feed_url = feeds[name]
    return proxy.fetch(feed_url, response)
示例#5
0
文件: scrapy.py 项目: nibil/scrapy
def _entry_fields(entries, index, keys):
    """Return the ASCII-encoded values of *keys* taken from ``entries[index]``.

    The result always has ``len(keys)`` items and starts out as single-space
    placeholders.  Values are filled in left to right; the first failure
    (no such entry, a value without ``encode``, an encoding error, ...)
    stops the fill and leaves the remaining positions as placeholders.
    """
    values = [" "] * len(keys)
    if not entries:
        return values
    try:
        entry = entries[index]
        for position, key in enumerate(keys):
            values[position] = entry.get(key, '').encode('ascii', 'ignore')
    except Exception:
        # Deliberate best effort: a missing or malformed entry must not
        # abort the row, so keep whatever was extracted so far.
        pass
    return values


def _write_placeholder(writer, url):
    """Write a ``'---'`` marker row for a line that yielded no usable data."""
    try:
        writer.writerow(['---', url])
    except Exception:
        # The URL itself could not be written; keep the marker on its own.
        writer.writerow(['---'])


def readData(inputFile, outputFile):
    """Fetch the page linked on each line of *inputFile* and write a CSV report.

    For every input line, the first comma-separated cell containing ``"http"``
    is taken as the URL, passed to ``fetch`` and the result to
    ``processData``.  A truthy result is flattened into one output row: name,
    URL, industry, location, title, languages, up to three education entries,
    up to three experience entries, and skills.  Lines that yield no data,
    or whose row cannot be written, produce a ``'---'`` marker row instead.
    A running line count is printed after each line, and
    ``process_completed()`` is called once at the end.

    NOTE: this relies on Python 2 string semantics (``decode`` is called on
    the lines read from *inputFile*).

    Args:
        inputFile: path of the text/CSV file to read, one record per line.
        outputFile: path of the CSV file to create or overwrite.
    """
    fieldnames = [
        "name", "Linkedin", "industry", "location", "title", "languages",
        "college1", "course1", "cperiod1",
        "college2", "course2", "cperiod2",
        "college3", "course3", "cperiod3",
        "company1", "designation1", "period1",
        "company2", "designation2", "period2",
        "company3", "designation3", "period3",
        "skills",
    ]
    education_keys = ('institution', 'course', 'span')
    experience_keys = ('organization', 'position', 'span')

    ########################
    ## Read Data from CSV ##
    ########################

    # Context managers close both files even when a line fails unexpectedly.
    with open(inputFile) as source:
        lines = source.readlines()

    with open(outputFile, "w") as out:
        # A single plain writer serves both the header and the data rows.
        writer = csv.writer(out)
        writer.writerow(fieldnames)

        for count, lk in enumerate(lines, 1):
            url = ''
            data = ''
            cells = lk.decode('utf-8').split(",")
            links = [cell for cell in cells if "http" in cell]
            if links:
                # Drop the line terminator whether the input uses CRLF or
                # bare LF, so it never ends up in the fetched/written URL.
                url = links[0].rstrip("\r\n")
                try:
                    data = fetch(url=url)
                except Exception:
                    # A failed download is reported as a marker row below.
                    data = ''

            proc = processData(data)
            if proc:
                name = proc.get('name', '')
                experience = proc.get('experience', '')
                skills = ",".join(proc.get('skills', ''))
                education = proc.get('education', '')
                languages = ",".join(proc.get('languages', ''))
                industry = proc.get('industry', '')
                loc = proc.get('location', '')
                location = loc.encode('ascii', 'ignore') if loc else ''
                tit = proc.get('title', '')
                title = tit.encode('ascii', 'ignore') if tit else ''

                row = [name, url, industry, location, title, languages]
                # Three fixed slots each for education and experience;
                # missing entries stay as single-space placeholders.
                for index in range(3):
                    row.extend(_entry_fields(education, index, education_keys))
                for index in range(3):
                    row.extend(_entry_fields(experience, index, experience_keys))
                row.append(skills)

                try:
                    writer.writerow(row)
                except Exception:
                    print("Skipped")
                    _write_placeholder(writer, url)
            else:
                _write_placeholder(writer, url)

            print(count)

    # Runs after the output file is closed, so every row is on disk first.
    process_completed()
def route_item(name, id):
    """Feed proxy handler: forward a request for a single feed item.

    Looks ``name`` up in ``settings.content.feeds``.  A configured feed is
    proxied via ``proxy.fetch`` using the feed's value followed by
    ``/<id>``, along with ``response``; an unknown name yields an
    ``HTTPError`` with status 404.
    """
    configured = settings.content.feeds
    is_known = name in configured
    if not is_known:
        return HTTPError(404,"File not found")
    base = configured[name]
    suffix = '/%s' % id
    return proxy.fetch(base + suffix, response)
def route_feeds(name):
    """Feed proxy handler: forward a request for a configured feed.

    A ``name`` present in ``settings.content.feeds`` is proxied by calling
    ``proxy.fetch`` with the stored value and ``response``; any other name
    yields an ``HTTPError`` with status 404.
    """
    configured = settings.content.feeds
    is_known = name in configured
    if not is_known:
        return HTTPError(404,"File not found")
    return proxy.fetch(configured[name], response)