Пример #1
0
def parse(page):
    """Parse ``page`` with ``eparse``; return an empty dict if parsing fails.

    On ``lxml.etree.ParserError`` a diagnostic line is written to standard
    error and an empty dict is returned.
    """
    # Bind the result before the try block: the error path returns it, and
    # binding it only after the try made that return raise UnboundLocalError.
    info = {}
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # ``print sys.stderr, msg`` printed the stream object's repr to
        # stdout; write the message to stderr itself instead.
        sys.stderr.write("page content error\n")
        return info

    # NOTE(review): nothing is extracted from ``t`` here and the success path
    # falls through returning None, exactly as before -- the rest of this
    # function appears to be missing; confirm against the full source.
Пример #2
0
def parse(page):
    """Run ``eparse`` on ``page``, falling back to ``{}`` on a parser error.

    When ``eparse`` raises ``lxml.etree.ParserError`` the message
    "page content error" goes to standard error and an empty dict is
    returned.
    """
    # ``info`` has to exist before the except clause can return it; it used
    # to be assigned only after the try block (UnboundLocalError on failure).
    info = {}
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # The old ``print sys.stderr, ...`` form sent the stream's repr plus
        # the message to stdout rather than writing to stderr.
        sys.stderr.write("page content error\n")
        return info

    # NOTE(review): ``t`` is not used and the success path returns None
    # implicitly, as in the original -- the remainder of the body looks cut
    # off; confirm against the full source.
Пример #3
0
def _first_text(node, selector):
    """Return the cleaned text of the first ``selector`` match under ``node``.

    The text is stripped and its newlines are replaced with spaces. Returns
    None when the selector matches nothing.
    """
    found = node.cssselect(selector)
    if not found:
        return None
    return found[0].text_content().strip().replace('\n', ' ')


def parse_csdn(page):
    """Extract link texts and education entries from ``page``.

    Returns a list holding first the cleaned text of each ``http:`` link
    found inside the first ``div[class="per_dynamic"]`` (if any), followed
    by one dict per ``div[class="position  education vevent vcard"]``
    element. Each dict may carry the keys ``school``, ``degree``, ``major``,
    ``dtstart``, ``dtend``, ``desc`` and ``activities``; a key is present
    only when its selector matched.

    If ``eparse`` raises ``lxml.etree.ParserError`` a message is written to
    standard error and an empty list is returned.
    """
    # (selector, key) pairs, applied in order. Order matters for 'dtend':
    # the ``dtstamp`` value is stored first and an explicit ``dtend`` element
    # overwrites it.
    fields = (
        ('h3[class="summary fn org"]', 'school'),
        ('span[class="degree"]', 'degree'),
        ('span[class="major"]', 'major'),
        ('abbr[class="dtstart"]', 'dtstart'),
        ('abbr[class="dtstamp"]', 'dtend'),
        ('abbr[class="dtend"]', 'dtend'),
        ('p[class=" desc details-education"]', 'desc'),
        ('p[class="desc details-education"]', 'activities'),
    )

    r = []
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # ``print sys.stderr, msg`` printed the stream's repr to stdout, and
        # the old ``return info`` raised NameError (no such name here).
        sys.stderr.write("page content error\n")
        return r

    find = t.cssselect('div[class="per_dynamic"]')
    if find:
        # A leading ``//`` is an absolute path and searches the whole
        # document even when called on an element; ``.//`` keeps the search
        # inside the matched div.
        for link in find[0].xpath('.//a[contains(@href,"http:")]'):
            r.append(link.text_content().strip().replace('\n', ' '))

    # Debug trace kept from the original (same text on stdout).
    sys.stdout.write("====== %s\n" % (r,))

    for entry in t.cssselect('div[class="position  education vevent vcard"]'):
        item = {}
        for selector, key in fields:
            text = _first_text(entry, selector)
            if text is not None:
                item[key] = text
        r.append(item)
    return r
Пример #4
0
def parse(page):
    """Parse ``page`` into the profile dict plus its ``'dynamic'`` entry.

    The result is whatever ``parse_profile`` returns for the parsed tree,
    with the output of ``parse_dynamic`` stored under ``'dynamic'``.
    Returns an empty dict (after writing a message to standard error) when
    ``eparse`` raises ``lxml.etree.ParserError``.
    """
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # ``print sys.stderr, msg`` printed the stream object's repr to
        # stdout; write the message to stderr itself instead.
        sys.stderr.write("page content error\n")
        return {}

    info = parse_profile(t)
    info['dynamic'] = parse_dynamic(t)
    return info
Пример #5
0
def parse_dir(t):
    """Return the ``href`` of every anchor matched by ``h2 strong a``.

    ``t`` is handed to ``eparse``. If that raises
    ``lxml.etree.ParserError`` a message is written to standard error and
    an empty list is returned.

    NOTE(review): the original body parsed an undefined name ``page`` and
    ignored its argument (NameError unless a global ``page`` happened to
    exist). This assumes callers pass the raw page content as ``t`` --
    confirm against the call sites.
    """
    try:
        tree = eparse(t)
    except lxml.etree.ParserError:
        # ``print sys.stderr, msg`` printed the stream object's repr to
        # stdout; write the message to stderr itself instead.
        sys.stderr.write("page content error\n")
        return []
    return [anchor.attrib['href'] for anchor in tree.cssselect('h2 strong a')]
Пример #6
0
def _first_text(node, selector):
    """Return the cleaned text of the first ``selector`` match under ``node``.

    The text is stripped and its newlines are replaced with spaces. Returns
    None when the selector matches nothing.
    """
    found = node.cssselect(selector)
    if not found:
        return None
    return found[0].text_content().strip().replace('\n', ' ')


def parse_csdn(page):
    """Collect link texts and education records from ``page``.

    The returned list starts with the cleaned text of each ``http:`` link
    inside the first ``div[class="per_dynamic"]`` (when that div exists)
    and continues with one dict per
    ``div[class="position  education vevent vcard"]`` element. A dict holds
    any of ``school``, ``degree``, ``major``, ``dtstart``, ``dtend``,
    ``desc`` and ``activities`` whose selector matched.

    If ``eparse`` raises ``lxml.etree.ParserError`` a message is written to
    standard error and an empty list is returned.
    """
    # Applied in order; the ``dtstamp`` value is stored under 'dtend' first
    # and is overwritten when a real ``dtend`` element is present.
    fields = (
        ('h3[class="summary fn org"]', 'school'),
        ('span[class="degree"]', 'degree'),
        ('span[class="major"]', 'major'),
        ('abbr[class="dtstart"]', 'dtstart'),
        ('abbr[class="dtstamp"]', 'dtend'),
        ('abbr[class="dtend"]', 'dtend'),
        ('p[class=" desc details-education"]', 'desc'),
        ('p[class="desc details-education"]', 'activities'),
    )

    r = []
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # The old handler printed ``sys.stderr``'s repr to stdout and then
        # hit NameError on ``return info``; report on stderr and return the
        # empty result list instead.
        sys.stderr.write("page content error\n")
        return r

    find = t.cssselect('div[class="per_dynamic"]')
    if find:
        # ``//a`` would search the entire document regardless of the
        # element it is called on; ``.//a`` restricts it to this div.
        titles = find[0].xpath('.//a[contains(@href,"http:")]')
        r.extend(a.text_content().strip().replace('\n', ' ') for a in titles)

    # Debug trace kept from the original (same text on stdout).
    sys.stdout.write("====== %s\n" % (r,))

    for block in t.cssselect('div[class="position  education vevent vcard"]'):
        item = {}
        for selector, key in fields:
            text = _first_text(block, selector)
            if text is not None:
                item[key] = text
        r.append(item)
    return r
Пример #7
0
def parse(page):
    """Parse ``page`` into a dict with ``'work'`` and ``'edu'`` entries.

    The base dict comes from ``parse_addition``; the results of
    ``parse_work`` and ``parse_edu`` are stored under ``'work'`` and
    ``'edu'``. Returns an empty dict (after writing a message to standard
    error) when ``eparse`` raises ``lxml.etree.ParserError``.
    """
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # ``print sys.stderr, msg`` printed the stream object's repr to
        # stdout; write the message to stderr itself instead.
        sys.stderr.write("page content error\n")
        return {}

    info = parse_addition(t)
    info['work'] = parse_work(t)
    info['edu'] = parse_edu(t)
    return info
Пример #8
0
def parse(page):
    """Build the profile dict for ``page``, including work and education.

    Starts from the dict returned by ``parse_addition`` for the parsed
    tree, then adds ``parse_work``'s result as ``'work'`` and
    ``parse_edu``'s result as ``'edu'``. If ``eparse`` raises
    ``lxml.etree.ParserError``, "page content error" is written to standard
    error and an empty dict is returned.
    """
    try:
        t = eparse(page)
    except lxml.etree.ParserError:
        # The former ``print sys.stderr, ...`` sent the stream's repr and the
        # message to stdout; the message belongs on stderr.
        sys.stderr.write("page content error\n")
        return {}

    info = parse_addition(t)
    for key, extract in (('work', parse_work), ('edu', parse_edu)):
        info[key] = extract(t)
    return info