Example #1
def pull_msg_content_ecolog(msg_raw):
    payload = msg_raw.get_payload()

    re_subject = re.compile('(?<=Subject: \[ECOLOG-L\] ).*(?=List-Subscribe)')
    subject = re_subject.findall(payload)
    subject = [x.replace("\\r\\n", "") for x in subject]
    if len(subject) == 0:
        subject = ["error"]

    # try each charset/encoding header variant seen in the archive, in order,
    # until one of them matches the body
    header_variants = [
        '"us-ascii"\\\\r\\\\nContent-Transfer-Encoding: quoted-printable',
        '"iso-8859-1"\\\\r\\\\nContent-Transfer-Encoding: ',
        '"UTF-8"\\\\r\\\\nContent-Transfer-Encoding: quoted-printable',
        '"utf-8"\\\\r\\\\nContent-Transfer-Encoding: quoted-printable',
        '"UTF-8"\\\\r\\\\nContent-Transfer-Encoding: ',
        '"Windows-1252"\\\\r\\\\nContent-Transfer-Encoding: ',
    ]
    body = []
    for variant in header_variants:
        re_body = re.compile(
            '(?<=Content-Type: text\\/plain; charset=' + variant +
            ').*?(?=Manage your Group settings)'
        )
        body = re_body.findall(payload)
        if body:
            break
    if len(body) == 0:
        body = ["error"]  # a list, so the cleanup below stays element-wise

    url = utils.extract_url(body)

    # clean the extracted bodies: strip quoted-printable soft breaks ("=")
    # and other line-wrapping artifacts
    body = [x.replace("=", "") for x in body]
    body = [x.replace("~", "") for x in body]
    body = [x.replace("--", "") for x in body]

    return [subject, body, url]
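
`utils.extract_url` is a project-local helper that receives the list of matched body strings before cleanup. A minimal sketch of what it might look like (hypothetical, not the project's actual code):

import re

URL_RE = re.compile(r'https?://[^\s"<>]+')

def extract_url(texts):
    # Hypothetical helper: collect every http(s) URL found in a list of strings.
    urls = []
    for text in texts:
        urls.extend(URL_RE.findall(text))
    return urls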
Example #2
def submit(link):
    url = extract_url(link)
    process_test_url(url, 'test_features.csv')
    # the classifier's output is parsed from text; the second whitespace-
    # separated token is the class label: 0 = safe, 1 = malicious,
    # anything else = malware
    return_ans = tr.gui_caller('url_features.csv', 'test_features.csv')
    a = str(return_ans).split()
    if int(a[1]) == 0:
        return Results.SAFE
        # answer = tkMessageBox.askquestion("Redirect","Do you want to visit the url?")
        # if answer == 'yes':
        #         webbrowser.open(url=E1.get(), new=1)
    elif int(a[1]) == 1:
        return Results.MALICIOUS
        # tkMessageBox.showinfo("URL Checker Result", "The URL " + url + " is Malicious")
        # answer_2 = tkMessageBox.askquestion("Redirect", "The url MALICIOUS, Do you still want to visit the url?")
        # if answer_2=='yes':
        #     webbrowser.open(url=E1.get(),new=1)
    else:
        # tkMessageBox.showinfo("URL Checker Result", "The URL " + url + " is Malware")
        # tkMessageBox.showwarning("Warning","Cant Redirect, url contains a malware")
        return Results.MALWARE
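
`Results` is not defined in this snippet; a minimal enum consistent with the three verdicts used here and in Example #11 (an assumption, since the original definition is not shown):

from enum import Enum

class Results(Enum):
    # Assumed members, inferred from the branches above.
    SAFE = 0
    MALICIOUS = 1
    MALWARE = 2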
Example #3
def get_url_and_display_variant(update, context):
    user = context.chat_data['user']

    search_url = utils.extract_url(update, context)
    if search_url is not None:
        logger.info("Bot extracted URL: %s", search_url)
        channel = utils.extract_domain(search_url)
        if channel in SUPPORTED_CHANNELS:
            update.message.reply_text(f"Brb! I'm learning more about this product on {channel}.")
            item_dict, variants_dict, variants_display_dict = utils.get_item_information(channel, search_url)
            update.message.reply_markdown(f"Hurray! Ive found \n\n{item_dict['item_name']}\n\n"
                                          'Which of these product variations would you like to track?',
                                          reply_markup=ReplyKeyboardMarkup.from_column(variants_display_dict,
                                                                                       one_time_keyboard=True))
            logger.info(f"BOT: prompted {user.first_name} for variant choice")

            # Store in context
            context_store_item(item_dict, context)
            context.chat_data['item_url'] = utils.shorten_url([search_url])[0]
            context.chat_data['channel'] = utils.extract_domain(search_url)
            context.chat_data['variants'] = variants_dict
            logger.info(context.chat_data['variants'])
            context.chat_data['variants_displayed'] = variants_display_dict
            # context.chat_data['item'] = item_dict
            logger.info(f"CONTEXT: Stored channel, variants, display, url for item {item_dict['item_name']}")

            return CHOOSE_THRESHOLD

        else:
            update.message.reply_text(f"Oops, I do not support {channel} yet. Let's try again.",
                                      reply_markup=ReplyKeyboardMarkup(start_reply_keyboard, one_time_keyboard=True))
            return INITIAL_CHOICE
    else:
        update.message.reply_text("Oops, you did not key in a valid URL. Let's try again.",
                                  reply_markup=ReplyKeyboardMarkup(start_reply_keyboard, one_time_keyboard=True))
        return INITIAL_CHOICE
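
`INITIAL_CHOICE` and `CHOOSE_THRESHOLD` are states for python-telegram-bot's `ConversationHandler`; they are presumably plain integer constants, e.g.:

# Hypothetical state constants; only the names appear in the snippet above.
INITIAL_CHOICE, CHOOSE_THRESHOLD = range(2)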
Example #4
def in_white_list(url):
    # "ds" is presumably a module-level list of whitelisted domain strings
    domain, _ = extract_url(url)
    for d in ds:
        if d in domain:
            return True
    return False
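
Note that `if d in domain` is a substring test: a whitelist entry `example.com` would also match `notexample.com`. A stricter variant (hypothetical, not from the source project) accepts only exact domains or their subdomains:

def in_white_list_strict(url, whitelist):
    # Hypothetical stricter check: exact domain or a dot-separated subdomain.
    domain, _ = extract_url(url)
    return any(domain == d or domain.endswith('.' + d) for d in whitelist)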
Example #5
def log(self, js_url, page_url):
    domain, uri = extract_url(page_url)
    d = self.getDomain(domain)
    page = d.getPage(domain + uri)
    page.logjs(js_url)
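
Examples #4, #5, and #10 all unpack `extract_url` into a `(domain, uri)` pair. A standard-library sketch of that variant (the real helper is project-specific):

from urllib.parse import urlparse

def extract_url(url):
    # Hypothetical (domain, uri) splitter consistent with the calls above.
    parts = urlparse(url)
    uri = parts.path or '/'
    if parts.query:
        uri += '?' + parts.query
    return parts.netloc, uri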
Example #6
def to_absolute_url(url):
    ''' Converts urls like "/discover/" to "http://www.kickstarter.com/discover/"
    '''
    return extract_url('{0}{1}'.format(ROOT_URL, url))
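
Given the docstring, `ROOT_URL` is presumably `http://www.kickstarter.com` (no trailing slash), so:

>>> to_absolute_url('/discover/')
'http://www.kickstarter.com/discover/'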
Example #7
File: title.py  Project: sanxiyn/sandbox
import collections
import email
import multiprocessing
import ssl
import sys

import imapclient
from imapclient import IMAPClient

import utils

# host and username are defined elsewhere in the project configuration
with open('password') as f:
    password = f.read()

args = sys.argv[1:]
if len(args) != 1:
    print('Usage: title.py folder')
    sys.exit()
folder, = args

context = imapclient.create_default_context()
context.verify_mode = ssl.CERT_NONE
imap = IMAPClient(host, ssl=True, ssl_context=context)
imap.login(username, password)
imap.select_folder(folder)
msgids = imap.search()
response = imap.fetch(msgids, ['BODY.PEEK[]'])
messages = []
for msgid in msgids:
    header = response[msgid]['BODY[]']
    message = email.message_from_string(header)
    messages.append(message)

counter = collections.Counter()
pool = multiprocessing.Pool()
urls = [utils.extract_url(message) for message in messages]
counter.update(pool.imap(utils.title_of_url, urls))
for title, count in counter.most_common():
    print(title, count)
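
`utils.title_of_url` is not shown; a plausible implementation (hypothetical) downloads each page and pulls out its `<title>`:

import re
import urllib.request

def title_of_url(url):
    # Hypothetical: fetch the page and return its <title> text, or None.
    try:
        with urllib.request.urlopen(url, timeout=10) as resp:
            html = resp.read().decode('utf-8', errors='replace')
    except Exception:
        return None
    match = re.search(r'<title[^>]*>(.*?)</title>', html,
                      re.IGNORECASE | re.DOTALL)
    return match.group(1).strip() if match else None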
Example #8
def test_extract_url():
    assert utils.extract_url(['https://asdf.dd']) == 'https://asdf.dd'    
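
The test pins this project's contract: `extract_url` takes a list of candidate strings and returns the matching URL as a plain string. One implementation consistent with the test (an assumption, since the helper itself is not shown):

def extract_url(candidates):
    # Hypothetical: return the first candidate that looks like an http(s) URL.
    for c in candidates:
        if c.startswith(('http://', 'https://')):
            return c
    return None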
Example #9
def update():
    try:
        db = utils.load_pickle(DB_PATH)
        last_update = sorted(db['date'])[-1]
    except Exception:  # no usable local database yet; download a fresh copy
        utils.download(DB_URL, DB_PATH)
        db = utils.load_pickle(DB_PATH)
        last_update = sorted(db['date'])[-1]

    # query arxiv api
    n_added = 0
    indx = 0

    while indx < MAX_ITER:

        url = BASE_URL + QUERY_FMT.format(DEF_QUERY, indx, RESULTS_PER_ITER)
        try:
            with urllib.request.urlopen(url, timeout=5.0) as resp:  # don't shadow `url`
                response = resp.read()
        except TimeoutError:
            continue  # retry the same window on timeout
        response = feedparser.parse(response)

        for entry in response.entries:
            e = utils.encode_feedparser_dict(entry)
            paper_url = utils.parse_arxiv_url(e["link"])
            date = e["published"]
            date = utils.convert_to_datetime(date)

            # content already in database
            if paper_url in db["url"]:
                if date <= last_update:
                    indx = MAX_ITER
                    break
                else:
                    continue

            # retrieve and clean some text
            title = e["title"]
            title = utils.rem_tex_fmt(title)
            authors = ", ".join(f"{n['name']}" for n in e["authors"])
            abstract = e["summary"]
            abstract = utils.rem_tex_fmt(abstract)
            other_urls = utils.extract_url(abstract)
            journal = e["arxiv_journal_ref"] if "arxiv_journal_ref" in e else ""
            journal = utils.rem_tex_fmt(journal)

            db["date"].append(date)
            db["url"].append(paper_url)
            db["title"].append(title)
            db["authors"].append(authors)
            db["abstract"].append(abstract)
            db["journal"].append(journal)
            db["other_urls"].append(other_urls)
            n_added += 1

        if len(response.entries) == 0:
            utils.progress_bar(indx / MAX_ITER,
                               status="API not responding. retrying...")
        if indx == MAX_ITER:
            utils.progress_bar(1)
        else:
            indx += 100
            utils.progress_bar(indx / MAX_ITER,
                               status=f"Fetching papers from {date}...")
        time.sleep(WAIT_TIME)
    print(f"{n_added} papers added to database")

    # sort every column of the database by date, persist it, then rebuild
    # the BM25 index over lowercased titles and filtered abstracts
    indx = list(np.argsort(db["date"]))
    db["date"] = list(np.array(db["date"])[indx])
    db["url"] = list(np.array(db["url"])[indx])
    db["title"] = list(np.array(db["title"])[indx])
    db["authors"] = list(np.array(db["authors"])[indx])
    db["abstract"] = list(np.array(db["abstract"])[indx])
    db["journal"] = list(np.array(db["journal"])[indx])
    db["other_urls"] = list(np.array(db["other_urls"])[indx])
    utils.save_pickle(DB_PATH, db)

    tkn_corpus = []
    for indx in range(len(db["url"])):
        title = db["title"][indx].lower()
        abstract = utils.filter_abstract(db["abstract"][indx].lower())
        tkn_corpus.append((title + " " + abstract).split(" "))
    bm25 = BM25Okapi(tkn_corpus)
    utils.save_pickle(CACHE_BM25, bm25)
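
`BM25Okapi` here matches the interface of the `rank_bm25` package (an assumption based on the constructor-over-token-lists usage). The cached index would later be queried along these lines:

from rank_bm25 import BM25Okapi  # assumed origin of BM25Okapi

# bm25 = utils.load_pickle(CACHE_BM25)
query_tokens = "graph neural networks".split(" ")
scores = bm25.get_scores(query_tokens)  # one relevance score per paper
top10 = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)[:10]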
Example #10
def save_content(url, title, content, evn):
    domain, uri = extract_url(url)
    d = Domain(domain, evn)
    d.update_content(uri, title, content)
Example #11
def index():
    form = Input(request.form)

    if request.method == 'POST' and form.validate():
        url = form.url.data
        base_url = extract_url(url)
        domain_info = getDomainInfo(base_url)['WhoisRecord']
        useful_domain_info = {}
        empty = False

        try:
            del domain_info['registrant']['rawText']
            domain_info['registrant']['street'] = domain_info[
                'registrant'].pop('street1')

            useful_domain_info = {
                'Registrar Name': domain_info['registrarName'],
                'Registrant Details': domain_info['registrant'],
                'Creation Date': domain_info['createdDate'],
                'Updation Date': domain_info['updatedDate'],
                'Expiration Date': domain_info['expiresDate'],
                'Domain Name': base_url
            }

        except KeyError as e:
            missing = e.args[0]  # Python 3: exceptions have no .message attribute
            if missing == 'registrant':
                print('REGISTRANT-------------------------------------------')
                useful_domain_info = {}
                empty = True
            elif missing == 'street1':
                print('STREET-------------------------------------------')
                useful_domain_info = {
                    'Registrar Name': domain_info['registrarName'],
                    'Registrant Details': domain_info['registrant'],
                    'Creation Date': domain_info['createdDate'],
                    'Updation Date': domain_info['updatedDate'],
                    'Expiration Date': domain_info['expiresDate'],
                    'Domain Name': base_url,
                }
                empty = False
            else:
                print('---------------ELSE------')
                print(missing)
        except Exception as e:
            print('-----------EXCEPTION-------------')
            print(e)

        result = submit(url)
        verdict = ''

        if result == Results.SAFE:
            verdict = 'SAFE'
        elif result == Results.MALICIOUS:
            verdict = 'MALICIOUS'
        else:
            verdict = 'MALWARE'

        if db.session.query(Store).filter(Store.url == base_url).count() == 0:
            info = Store(base_url, verdict)
            db.session.add(info)
            db.session.commit()

        if result == Results.SAFE:
            print('----------- SAFE -----------')
            return render_template('safe.html',
                                   url=url,
                                   base_url=extract_url(url),
                                   info=useful_domain_info,
                                   isempty=empty)
        elif result == Results.MALICIOUS or result == Results.MALWARE:
            print('----------- MALICIOUS -----------')
            return render_template('malicious.html',
                                   url=url,
                                   base_url=extract_url(url),
                                   info=useful_domain_info,
                                   empty=empty)

    return render_template('index.html', form=form)
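
`Store` is a Flask-SQLAlchemy model whose definition is not shown; a minimal sketch consistent with `Store(base_url, verdict)` and the `Store.url` filter above (hypothetical):

class Store(db.Model):
    # Hypothetical model; only the constructor arity and the `url`
    # column are implied by the snippet above.
    id = db.Column(db.Integer, primary_key=True)
    url = db.Column(db.String(2048), unique=True)
    verdict = db.Column(db.String(16))

    def __init__(self, url, verdict):
        self.url = url
        self.verdict = verdict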