Example #1
def save_data(data):
    """
    Save the scraped data to a .csv file.
    """
    flatten = lambda x: list(chain.from_iterable(x))

    df = pd.DataFrame(flatten(data))
    df = process(df)
    df.to_csv('./data/fifa21.csv', index=False)
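
A minimal usage sketch (hypothetical data; assumes pandas as pd, itertools.chain and the project's process() helper are importable, as the function body implies):

    pages = [
        [{'name': 'Player A', 'ova': 91}],   # rows scraped from page 1
        [{'name': 'Player B', 'ova': 88}],   # rows scraped from page 2
    ]
    save_data(pages)  # flattens the per-page rows, runs process() and writes ./data/fifa21.csv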
Example #2
def sample(checkpoint, length, lstm_size, start=""):
    """
    生成新文本
    
    checkpoint: 某一轮迭代的参数文件
    length: 新文本的字符长度
    lstm_size: 隐层结点数
    start: 起始文本
    """

    data, word2int, int2word, vocab = process(FLAGS.file_path)

    with open("./output/w2i.txt", "w") as f:
        f.write(str(word2int))
    with open("./output/i2w.txt", "w") as f:
        f.write(str(int2word))
    with open("./output/vocab.txt", "w") as f:
        f.write(str(vocab))

    # the seed must contain at least one Chinese character (CJK unified ideographs)
    pattern = re.compile("[\u4e00-\u9fa5]")
    match = re.search(pattern, start)
    while match is None:
        # otherwise pick a random vocabulary entry as the seed
        # (np.random.random_integers is deprecated in NumPy; randint's upper bound is exclusive)
        start = int2word[np.random.randint(7, len(vocab))]
        match = re.search(pattern, start)

    print("随机起始文字:%s" % start)
    content = [start]

    # sampling=True means the batch size is 1 x 1
    model = WordRNN(len(vocab), lstm_size=lstm_size, sampling=True)
    saver = tf.train.Saver()
    with tf.Session() as sess:
        # load the model parameters saved during training
        saver.restore(sess, checkpoint)
        new_state = sess.run(model.initial_state)
        x = np.zeros((1, 1))
        w = word2int[start]

        # keep generating characters until the requested length is reached
        for i in range(length):
            x[0, 0] = w
            feed = {
                model.inputs: x,
                model.keep_prob: 1.,
                model.initial_state: new_state
            }
            preds, new_state = sess.run([model.prediction, model.final_state],
                                        feed_dict=feed)

            # idx = np.argmax(preds[0])

            w = pick_top_n(preds, len(vocab))
            content.append(int2word[w])

    return ''.join(content)
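
A hypothetical call, assuming a checkpoint produced by a training run such as the one in Example #6 and that lstm_size matches the value used for training (512 is only a placeholder):

    checkpoint = tf.train.latest_checkpoint('./checkpoints')
    text = sample(checkpoint, length=500, lstm_size=512, start="")
    print(text)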
Example #3
def save(data):
    for item in data:
        if 'committee' not in item: continue
        d = None
        if 'date' in item:
            d = item['date']
        elif 'end' in item:
            d = item['end']
        elif 'time' in item:
            d = item['time']

        if not isinstance(d, str):
            d = str(d)

        id = item['committee'] + d + str(item['seq_no'])
        item['id'] = id
        process(item,
                id,
                db.comagenda,
                'ep_comagendas',
                id + ' - ' + item['title'],
                onchanged=onchanged)
    return data
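
For illustration, a hypothetical agenda item and the id the loop above would build for it:

    item = {'committee': 'LIBE', 'date': '2012-01-25', 'seq_no': 3, 'title': 'Data protection'}
    # item['id'] becomes 'LIBE' + '2012-01-25' + '3' == 'LIBE2012-01-253'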
Example #4
def scrape(url, **kwargs):
    log(3,"scraping %s" % (url))
    root = getXML(url)
    if root is None:
        log(1,"could not get votes for", url)
        return # angrily o/
    log(3, "processing plenary votes xml from %s" % url)
    # root is:
    #PV.RollCallVoteResults EP.Number="PE 533.923" EP.Reference="P7_PV(2014)04-17" Sitting.Date="2014-04-17" Sitting.Identifier="1598443"
    votes=[]
    for vote in root.xpath('//RollCallVote.Result'):
        # hrmpf, some EP seriously used the braindead Y-d-m format sometimes in vote timestamps :/
        time = vote.get('Date')
        if len(time.split()) == 2:
            ts = datetime.strptime(time, "%Y-%m-%d %H:%M:%S")
        else:
            ts = datetime.strptime(time, "%Y-%m-%d")
        tmp=vote.get('Identifier')
        if tmp:
            voteid = int(tmp)
        else:
            tmp = vote.get('Number')
            if not tmp:
                log(1, "blimey, could not deduce an id for the vote in %s" % url)
                raise ValueError("no id for vote in %s" % url)
            voteid = "%s-%s" % (ts,tmp)
        title = vote.xpath("RollCallVote.Description.Text")
        if len(title) != 1:
            log(2, "holy ambiguity Batman! This vote doesn't have one title, but %d: %d %s" % (len(title), voteid, url))
            title="!unknown!"
        else:
            title=junws(title[0])
        v={u"ts": ts,
           u"url": url,
           u"voteid": voteid,
           u"title": title,
           'votes':{}}
        v.update(votemeta(v['title'], v['ts']))
        if 'epref' not in v:
            ref = vote.xpath("RollCallVote.Description.Text/a/text()")
            if ref:
                v['epref']=unws(ref[0])
        for type, stype in [('Result.For','+'), ('Result.Against','-'), ('Result.Abstention','0')]:
            type = vote.xpath(type)
            if not type: continue
            if len(type)>1:
                log(2, "[pff] more than one %s entry in vote (id:%d) in %s" % (stype, v['voteid'], url))
            type = type[0]
            v['votes'][stype]={'total': int(type.get('Number')),
                               'groups': {}}
            for group in type.xpath('Result.PoliticalGroup.List'):
                g = str(group.get('Identifier'))
                if g not in v['votes'][stype]['groups']:
                    v['votes'][stype]['groups'][g]=[]
                for tag in ['Member.Name', 'PoliticalGroup.Member.Name']:
                    for mep in group.xpath(tag):
                        m = {}
                        name = junws(mep)
                        mepid = mep.get("PersId")
                        if mepid:
                            mepid = int(mepid)
                        else:
                            mepid = db.getMep(name, v['ts'], abbr=g)
                        if mepid:
                            m['mepid']= mepid
                            #if int(mep.get('MepId')) in ambiguous_meps:
                            #    oid = int(mep.get('MepId'))
                            #    ambiguous_meps.remove(oid)
                            #    log(2,'found mepid for previously ambigous obscure_id: "%s": %s' % (oid, mepid))
                        else:
                            mepid = lost_meps.get(mep.get('MepId'))
                            if mepid:
                                m['mepid']= mepid
                            else:
                                m['name']= name
                                m['obscure_id']=int(mep.get('MepId'))  # it's a totally useless and confusing id that is nowhere else used
                        v['votes'][stype]['groups'][g].append(m)
        # save
        process(v, v['voteid'], db.vote, 'ep_votes', v['title'])
        votes.append(v)
    return votes
Example #5
def scrape(id, terms, mepname, **kwargs):
    activity_types = (
        ('plenary-speeches', 'CRE'),
        ('reports', "REPORT"),
        ('reports-shadow', "REPORT-SHADOW"),
        ('opinions', "COMPARL"),
        ('opinions-shadow', "COMPARL-SHADOW"),
        ('motions-instit', "MOTION"),
        ('oral-questions', "OQ"),
        # other activities
        ('written-explanations', 'WEXP'),
        ('major-interpellations', 'MINT'),
        ('written-questions', "WQ"),
        ('motions-indiv', "IMOTION"),
        ('written-declarations', "WDECL"),
    )
    activities = {}
    for type, TYPE in activity_types:
        for term in terms:
            page = 0
            cnt = 20
            url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (
                id, type, term, page, cnt)
            try:
                root = fetch(url)
            except:
                log(1, "failed to fetch {}".format(url))
                raise ValueError
                #continue
            #print(url, file=sys.stderr)
            while (len(root.xpath('//div[@class="erpl_document"]')) > 0):
                for node in root.xpath('//div[@class="erpl_document"]'):
                    if type == 'written-explanations':
                        item = {
                            'title':
                            unws(''.join(
                                node.xpath(
                                    './div/h3/span[@class="t-item"]//text()'))
                                 ),
                            'date':
                            datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')
                                [0], u"%d-%m-%Y"),
                            'text':
                            unws(''.join(node.xpath('./div[2]/div//text()')))
                        }
                    elif type == 'written-declarations':
                        if len(node.xpath('./div[1]/div')) != 3:
                            log(
                                2,
                                "written decl item has not 3 divs but %d %s" %
                                (len(node.xpath('./div[1]/div')), url))
                            continue
                        if len(node.xpath('./div[1]/div[1]/span')) != 3:
                            log(
                                2,
                                "written decl item has not 3 but %d spans in the 1st div at %s"
                                %
                                (len(node.xpath('./div[1]/div[1]/span')), url))
                            continue

                        item = {
                            'title':
                            unws(''.join(
                                node.xpath(
                                    './div/h3/span[@class="t-item"]//text()'))
                                 ),
                            'date':
                            datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')
                                [0], u"%d-%m-%Y"),
                            'id':
                            unws(''.join(
                                node.xpath('./div[1]/div[1]/span[2]/text()')
                                [0])),
                            'status':
                            unws(''.join(
                                node.xpath('./div[1]/div[1]/span[3]/text()')
                                [0])),
                            'formats': [{
                                'type':
                                unws(fnode.xpath('./span/text()')[0]),
                                'url':
                                str(fnode.xpath('./@href')[0]),
                                'size':
                                unws(fnode.xpath('./span/span/text()')[0])
                            } for fnode in node.xpath(
                                './div[1]/div[2]/div[@class="d-inline"]/a')],
                            'authors': [{
                                'name': name.strip(),
                                "mepid": db.mepid_by_name(name.strip())
                            } for name in node.xpath(
                                './div[1]/div[3]/span/text()')],
                        }
                        for info in node.xpath('./div[2]/div'):
                            label = unws(''.join(info.xpath('./text()')))[:-2]
                            value = unws(''.join(info.xpath('./span/text()')))
                            if 'date' in label.lower():
                                value = datetime.strptime(value, u"%d-%m-%Y")
                            if label == 'Number of signatories':
                                number, date = value.split(' - ')
                                value = int(number)
                                item["No of sigs date"] = datetime.strptime(
                                    date, u"%d-%m-%Y")
                            item[label] = value
                    else:
                        #from lxml.etree import tostring
                        #print('\n'.join(tostring(e).decode() for e in node.xpath('./div/div[1]')))
                        # all other activities share the following scraper
                        ref = unws(''.join(
                            node.xpath('./div[1]/div[1]/span[2]/text()')))

                        if ref.startswith('- '):
                            ref = ref[2:]
                        if ref.endswith(' -'):
                            ref = ref[:-2]

                        item = {
                            'date':
                            datetime.strptime(
                                node.xpath('./div[1]/div[1]/span[1]/text()')
                                [0], u"%d-%m-%Y"),
                            'reference':
                            ref,
                        }

                        if type not in ['written-questions', 'oral-questions']:
                            ref = unws(''.join(
                                node.xpath('./div[1]/div[1]/span[3]/text()')))
                            if ref:
                                if not pere.match(ref):
                                    log(
                                        2,
                                        "pe, has not expected format: '%s'" %
                                        ref)
                                else:
                                    item['pe'] = ref

                        # opinions don't have title urls... why would they?
                        refurl = node.xpath('./div[1]/h3/a/@href')
                        if refurl: item['url'] = str(refurl[0])

                        item['title'] = unws(''.join(
                            node.xpath(
                                './div/h3//span[@class="t-item"]//text()')))

                        abbr = node.xpath(
                            './div[1]/div[1]/span/span[contains(concat(" ",normalize-space(@class)," ")," erpl_badge-committee ")]/text()'
                        )
                        if len(abbr):
                            item['committee'] = [
                                a for a in [unws(c) for c in abbr] if a
                            ]

                        formats = []
                        for fnode in node.xpath(
                                './div[1]/div[2]/div[@class="d-inline"]/a'):
                            elem = {
                                'type': unws(fnode.xpath('./span/text()')[0]),
                                'url': str(fnode.xpath('./@href')[0])
                            }
                            tmp = fnode.xpath('./span/span/text()')
                            if len(tmp) > 0:
                                elem['size'] = unws(tmp[0])
                            formats.append(elem)
                        if formats:
                            item['formats'] = formats

                        authors = [{
                            'name': name.strip(),
                            "mepid": db.mepid_by_name(name.strip())
                        } for name in node.xpath('./div[1]/div[3]/span/text()')
                                   ]
                        if authors: item['authors'] = authors

                        if type in ['opinions-shadow', 'opinions']:
                            for f in item['formats']:
                                if f['type'] == 'PDF':
                                    ref = pdf2ref(f['url'])
                                    if ref is not None:
                                        item['dossiers'] = [ref]
                                    break
                        else:
                            # try to deduce dossier from document reference
                            dossiers = db.get('dossiers_by_doc',
                                              item['reference']) or []
                            if len(dossiers) > 0:
                                item['dossiers'] = [
                                    d['procedure']['reference']
                                    for d in dossiers
                                ]
                            elif '+DOC+PDF+' not in item['url']:
                                # try to figure out the associated dossier by making an (expensive) http request to the ep
                                log(
                                    4, "fetching primary activity page %s" %
                                    item['url'])
                                try:
                                    refroot = fetch(item['url'])
                                except:
                                    refroot = None
                                if refroot is not None:
                                    if '/doceo/' in item[
                                            'url']:  # stupid new EP site removed the span with the procedure, bastards.
                                        fulla = refroot.xpath(
                                            '//table[@class="buttondocwin"]//a/img[@src="/doceo/data/img/navi_moredetails.gif"]/..'
                                        )
                                        if fulla:
                                            fullurl = fulla[0].get('href')
                                            if fullurl.endswith('.html'):
                                                if fullurl[-7:-5] != 'EN':
                                                    fullurl = fullurl[:-7] + 'EN.html'
                                                log(
                                                    4,
                                                    'loading activity full text page %s'
                                                    % fullurl)
                                                if fullurl.startswith(
                                                        '/doceo'):
                                                    fullurl = 'https://www.europarl.europa.eu' + fullurl
                                                if fullurl != item['url']:
                                                    refroot = fetch(fullurl)
                                        else:
                                            log(
                                                4, 'no fulla for %s' %
                                                item['url'])
                                    anchor = refroot.xpath(
                                        '//span[@class="contents" and text()="Procedure : " and not(ancestor::div[@style="display:none"])]'
                                    )
                                    if len(anchor) == 1:
                                        dossier = anchor[0].xpath(
                                            "./following-sibling::a/text()")
                                        if len(dossier) == 1:
                                            item['dossiers'] = [
                                                unws(dossier[0])
                                            ]
                                        elif len(dossier) > 1:
                                            log(
                                                2,
                                                "more than one dossier in ep info page: %d %s"
                                                % (len(dossier), item['url']))
                                    elif len(anchor) > 1:
                                        log(
                                            2,
                                            "more than one anchor in ep info page: %d %s"
                                            % (len(anchor), item['url']))

                    item['term'] = term
                    if TYPE not in activities:
                        activities[TYPE] = []
                    activities[TYPE].append(item)
                if len(root.xpath('//div[@class="erpl_document"]')) < cnt:
                    break
                page += 1
                url = "http://www.europarl.europa.eu/meps/en/%s/loadmore-activities/%s/%s/?page=%s&count=%s" % (
                    id, type, term, page, cnt)
                try:
                    root = fetch(url)
                except:
                    log(1, "failed to fetch {}".format(url))
                    #raise ValueError
                    break
                #print(url, file=sys.stderr)
        if TYPE in activities:
            activities[TYPE] = sorted(activities[TYPE],
                                      key=lambda x: x['date'])
    activities['mep_id'] = id
    if len(activities.keys()) > 1:
        process(activities,
                id,
                db.activities,
                'ep_mep_activities',
                mepname,
                nodiff=True)
        return activities
    return {}
Example #6
def train(batch_size=10, seq_len=150, epochs=200):
    if not os.path.exists(os.path.dirname(FLAGS.checkpoints_dir)):
        os.mkdir(os.path.dirname(FLAGS.checkpoints_dir))
    if not os.path.exists(FLAGS.checkpoints_dir):
        os.mkdir(FLAGS.checkpoints_dir)

    data, word2int, int2word, vocab = process(FLAGS.file_path)
    with open("./output/vocabularies.txt", "w+") as f:
        f.write(str(vocab))

    model = WordRNN(len(vocab),
                    batch_size=batch_size,
                    seq_len=seq_len,
                    lstm_size=lstm_size,
                    layer_count=layer_count,
                    learning_rate=learning_rate)

    saver = tf.train.Saver(max_to_keep=100)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())

        start_epoch = 0
        checkpoint = tf.train.latest_checkpoint('./checkpoints')
        if checkpoint:
            saver.restore(sess, checkpoint)
            print(checkpoint)
            print("[%s] 从checkpoints中恢复继续训练 {0}".format(checkpoint) %
                  strdatetime())
            pattern = re.compile("\./checkpoints/(\d+).*")
            start_epoch += int(re.match(pattern, checkpoint).group(1))

        print('[%s] starting training...' % strdatetime())
        for e in range(start_epoch, epochs):
            print("[%s]--------- 第%d轮(共%d轮) --------" %
                  (strdatetime(), e + 1, epochs))
            # Train network
            new_state = sess.run(model.initial_state)

            batch = 0
            batch_count = int(len(data) / (batch_size * seq_len))
            print("共计 %d 词语单元, %d 批次" % (len(data), batch_count))
            for x, y in generate_batch(data, batch_size, seq_len):
                batch += 1
                start = time.time()
                feed = {
                    model.inputs: x,
                    model.targets: y,
                    model.keep_prob: keep_prob,
                    model.initial_state: new_state
                }
                loss, new_state, _ = sess.run(
                    [model.loss, model.final_state, model.optimizer],
                    feed_dict=feed)

                end = time.time()
                # control the print lines
                # if counter % 100 == 0:
                print('[%s] batch: %d, time: %.6fs, loss: %.6f' %
                      (strdatetime(), batch, end - start, loss))

                if ((e + 1) % save_freq == 0
                        and (batch == batch_count or batch == 1
                             or batch == int(batch_count / 2))):
                    saver.save(
                        sess, "checkpoints/{}-{}-{}.ckpt".format(
                            e + 1, batch, lstm_size))
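
A hypothetical entry point, assuming the module-level hyperparameters (lstm_size, layer_count, learning_rate, keep_prob, save_freq) and FLAGS.file_path are defined elsewhere in the script:

    if __name__ == '__main__':
        train(batch_size=32, seq_len=100, epochs=50)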
Example #7
def scrape(url, meps=None, **kwargs):
    prolog=True
    res=[]
    block=None
    reference=None
    date=None
    committee=[]
    text, PE=getraw(url)
    motion = False
    for line in text:
        #log(4,'line is: "%s"' % line)
        if prolog:
            line=unws(line)
            if not line: continue

            if amstart.match(line):
                if PE is None:
                    log(1, "document has no PE id: %s" % url)
                if reference is None:
                    log(1,"[!] couldn't find ref: %s" % (unws([x for x in text[:20] if unws(x)][2])))
                    # marking as scraped though
                    if not motion:
                        log(1, "couldn't find dossier reference in source pdf: %s" % url)
                        #raise ValueError("No dossier reference in amendment: %s" % url)
                        return
                    log(3, "couldn't find dossier reference in source pdf, but was marked as motion: %s" % url)
                    return
                if date is None or not committee:
                    log(1,"[!] couldn't find date or committee: %s" % url)
                    raise ValueError("No date or committee in amendment")
                block=[line]
                prolog=False
                continue

            if line == 'Draft motion for a resolution': 
                log(4,"document is a draft motion for resolution")
                motion = True

            m = re.search(pere, line)
            if m:
                if PE is None: PE = m.group(0)
                log(4,"found PE reference: %s" % PE)
                line = unws(line.replace(PE,''))
                log(4,'updated line is: "%s"' % line)

            if line in COMMITTEE_MAP:
                log(4,'found committee: "%s"' % line)
                committee.append(COMMITTEE_MAP[line])
                continue

            m = re.search(refre, line)
            if (committee and not reference and m):
                reference=m.group(1)
                log(4,'found reference: "%s"' % reference)
                if url == 'http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN':
                    log(3, "adjusting reference to eudatap")
                    reference="2012/0011(COD)"
                continue

            if (not date):
                try:
                    date = parse(unws(line), dayfirst=True)
                    log(4,'found date: "%s"' % line)
                except ValueError:
                    pass
                except TypeError:
                    pass
            continue

        if amstart.match(line):
            # parse block
            am=parse_block(block, url, reference, date, committee, meps, PE)
            if am is not None:
                process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True)
                res.append(am)
            block=[line]
            continue
        block.append(line)
    if block and any(filter(None, block)):  # a bare filter() object is always truthy in Python 3
        am = parse_block(block, url, reference, date, committee, meps, PE)
        if am is not None:
            process(am, am['id'], db.amendment, 'ep_amendments', am['reference']+' '+am['id'], nodiff=True)
            res.append(am)
    log(3,"total amendments %d in %s" % (len(res),url))
    return res
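
A hypothetical invocation, reusing the amendment document URL already special-cased in the prolog parsing above:

    amendments = scrape('http://www.europarl.europa.eu/sides/getDoc.do?pubRef=-%2f%2fEP%2f%2fNONSGML%2bCOMPARL%2bPE-506.166%2b03%2bDOC%2bPDF%2bV0%2f%2fEN')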
Example #8
def process_feature(data, transform=lambda x: x, policy=lambda x: True):
    return process(data, users['msno'], indices, transform, policy)
Example #9
def process_feature(data, transform=lambda x: x, policy=lambda x: True):
    return process(data, songs['song_id'], indices, transform, policy)
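
Examples #8 and #9 appear to wrap the same shared process() helper, differing only in the key column (users['msno'] vs songs['song_id']). A hypothetical call, with a made-up feature_series and throwaway lambdas:

    cleaned = process_feature(feature_series,
                              transform=lambda x: max(x, 0),  # clip negative values
                              policy=lambda x: x == x)        # drop NaNs (NaN != NaN)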
Example #10
def scrape(id, **kwargs):
    # we ignore the /meps/en/<id>/<name>/home path, since we can get all info also from other pages
    url = "http://www.europarl.europa.eu/meps/en/%s/name/cv" % id
    xml = fetch_raw(url)  # we have to patch up the returned html...
    xml = xml.replace("</br>", "<br/>")  # ...it contains some bad tags..
    root = fromstring(
        xml
    )  # ...which make the lxml soup parser drop some branches in the DOM
    sidebar_check(root, url)

    mep = {
        'UserID':
        id,
        'Name':
        mangleName(
            unws(' '.join(
                root.xpath('//span[@class="sln-member-name"]/text()'))), id),
        'Photo':
        "https://www.europarl.europa.eu/mepphoto/%s.jpg" % id,
        'meta': {
            'url': url
        },
        'Twitter': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Twitter"]/@href'
            )
        ],
        'Homepage': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Website"]/@href'
            )
        ],
        'Facebook': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Facebook"]/@href'
            )
        ],
        'Instagram': [
            unws(x.replace("http:// ", "")) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="Instagram"]/@href'
            )
        ],
        'Mail': [
            deobfus_mail(x) for x in root.xpath(
                '//section[@id="presentationmep"]//a[@data-original-title="E-mail"]/@href'
            )
        ],
        'Addresses':
        parse_addr(root),
        'active':
        False,
    }

    mep = addchangednames(mep)

    birthdate = root.xpath('//time[@id="birthDate"]/text()')
    if len(birthdate) > 0:
        mep['Birth'] = {
            'date': datetime.strptime(unws(birthdate[0]), u"%d-%m-%Y")
        }
        place = root.xpath('//time[@id="birthDate"]/following-sibling::text()')
        if len(place) > 0:
            tmp = unws(' '.join(place))
            if tmp.startswith(", "): tmp = tmp[2:]
            mep['Birth']['place'] = tmp

    death = root.xpath('//time[@id="deathDate"]/text()')
    if death:
        mep['Death'] = datetime.strptime(unws(death[0]), u"%d-%m-%Y")

    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]

    if body.xpath('.//h1[text()="Curriculum vitae "]'):
        if not body.xpath('.//h3[@id="no_cv_available"]'):
            mep['CV'] = {
                'updated':
                datetime.strptime(
                    unws(
                        body.xpath(
                            './/p[@class="small"]/strong[contains(text(),"Updated: ")]/text()'
                        )[0]), u"Updated: %d/%m/%Y")
            }
            mep['CV'].update({
                unws(''.join(title.xpath(".//text()"))): [
                    unws(''.join(item.xpath(".//text()"))).replace(
                        "-...", "- ...")
                    for item in title.xpath("following-sibling::ul/li")
                ]
                for title in body.xpath('.//h4')
                #if not unws(''.join(title.xpath(".//text()"))).startswith("Original version : ")
            })

    # assistants
    url = "http://www.europarl.europa.eu/meps/en/%s/name/assistants" % id
    root = fetch(url)
    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Assistants":
        for h4 in body.xpath('.//h4'):
            title = unws(''.join(h4.xpath(".//text()")))
            assistants = [
                unws(''.join(item.xpath(".//text()")))
                for item in h4.xpath("../div//span")
            ]
            if title in ['Accredited assistants', 'Local assistants']:
                if 'assistants' not in mep: mep['assistants'] = {}
                title = title.lower().split()[0]
                if assistants: mep['assistants'][title] = assistants
            elif title in [
                    'Accredited assistants (grouping)',
                    'Local assistants (grouping)', 'Service providers',
                    'Trainees', 'Paying agents (grouping)', 'Paying agents',
                    'Assistants to the Vice-Presidency/to the Quaestorate'
            ]:
                if 'assistants' not in mep: mep['assistants'] = {}
                title = title.lower()
                if assistants: mep['assistants'][title] = assistants
            else:
                log(2,
                    'unknown title for assistants "{}" {}'.format(title, url))
                raise ValueError

    # declarations
    root = fetch("http://www.europarl.europa.eu/meps/en/%s/name/declarations" %
                 id)
    body = root.xpath(
        '//span[@id="detailedcardmep"]/following-sibling::section')[0]
    if unws(' '.join(body.xpath(".//h1/text()"))) == "Declarations":
        for title in body.xpath('.//h4'):
            key = unws(''.join(title.xpath('.//text()')))
            if key == 'Declaration of financial interests':
                key = 'Financial Declarations'
                mep[key] = []
                for pdf in title.xpath('./following-sibling::ul/li/a'):
                    url = pdf.xpath('./@href')[0]
                    try:
                        mep[key].append(findecl.scrape(url))
                    except:
                        log(1, "failed to extract findecl from %s" % url)
            elif key == 'Declarations of participation by Members in events organised by third parties':
                key = 'Declarations of Participation'
                mep[key] = []
                for pdf in title.xpath(
                        './following-sibling::ul/li/a'
                )[::
                  -1]:  # reversed order, otherwise newer ones get prepended and mess up the diff
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            elif key in [
                    'Declaration of good conduct',
                    'Voluntary confirmation on the use of the General Expenditure Allowance'
            ]:
                mep[key] = []
                for pdf in title.xpath(
                        './following-sibling::ul/li/a'
                )[::
                  -1]:  # reversed order, otherwise newer ones get prepended and mess up the diff
                    url = pdf.xpath('./@href')[0]
                    name = unws(''.join(pdf.xpath('.//text()')))
                    mep[key].append({'title': name, 'url': url})
            else:
                log(
                    2,
                    'unknown type of declaration: "%s" http://www.europarl.europa.eu/meps/en/%s/name/declarations'
                    % (key, id))
                key = None
                raise ValueError

    # history
    parse_history(id, root, mep)
    process(mep,
            id,
            db.mep,
            'ep_meps',
            mep['Name']['full'],
            nopreserve=(['Addresses'], ['assistants']),
            onchanged=onchanged)

    if __name__ == '__main__':
        return mep
    del mep
Example #11
#    You should have received a copy of the GNU Affero General Public License
#    along with parltrack.  If not, see <http://www.gnu.org/licenses/>.

# (C) 2019 by Stefan Marsiske, <*****@*****.**>, Asciimoo

from db import db
from utils.process import process
import requests

if __name__ == "__main__":
    csv = requests.get(
        'https://github.com/TechToThePeople/mep/raw/production/data/meps.nogender.csv'
    ).text
    genders = [l.split(',')[:2] for l in csv.split('\n')][1:-1]
    try:
        for mepid, gender in genders:
            mep = db.mep(int(mepid))
            if not mep:
                print("meeeeeheeheheh", mepid)
                continue
            mep['Gender'] = gender
            process(mep,
                    int(mepid),
                    db.mep,
                    'ep_meps',
                    mep['Name']['full'],
                    nopreserve=(['Addresses'], ['assistants']))
    finally:
        db.commit('ep_meps')