Пример #1
0
def recheck(code, file):
    """Retry text extraction for a small file, falling back to a new search.

    Files larger than 15000 bytes are left untouched.  For a file at or below
    that size, ``extract_text.js`` is run through ``node`` to produce
    ``./txt/<stem>.txt``; if that text file still does not exist afterwards,
    ``spider.search(code)`` is called.

    Args:
        code: Value handed unchanged to ``spider.search``.
        file: Path of the file to check.  The text file name is derived from
            ``file[6:-4]`` (first six and last four characters dropped);
            presumably a fixed directory prefix and a 4-character extension,
            to be confirmed against the callers.

    Raises:
        OSError: If ``file`` cannot be stat'ed by ``os.path.getsize``.
    """
    # Local import: the file-level import block is not visible from here.
    import subprocess

    if os.path.getsize(file) > 15000:
        return

    txt_path = './txt/' + file[6:-4] + '.txt'
    cmd = 'node extract_text.js ' + file + " " + txt_path
    print(cmd)
    try:
        # An argument list (no shell) keeps paths containing spaces or shell
        # metacharacters intact instead of letting the shell re-split them.
        subprocess.run(['node', 'extract_text.js', file, txt_path], check=False)
    except OSError as exc:
        # A missing ``node`` binary must not abort the fallback below.
        print(exc)

    if not os.path.exists(txt_path):
        spider.search(code)
Пример #2
0
def eventloop(sender=ConsoleSender()):
    """Prompt for URLs and keywords, then poll the URLs and report matches.

    Keywords are entered as semicolon-separated groups.  Within a group the
    whitespace-separated words must all match (``analyzer.AndPattern``); a
    word prefixed with ``-`` is negated (``analyzer.NegPattern``).  The groups
    are combined with ``analyzer.OrPattern`` and the result is wrapped by
    ``analyzer.filter_tag('title')``.

    Every 900 seconds each URL is handed to ``search`` together with a
    handler that records ``"<title>: <url>"`` for each matching document whose
    URL has not been reported before.  Collected lines are sent through
    ``sender.send`` under the subject ``'Topic Alert'``.  The loop runs until
    interrupted with Ctrl-C.

    Args:
        sender: Object with a ``send(subject, body)`` method.  NOTE: the
            default ``ConsoleSender()`` is created once, when this function is
            defined, and is shared by every call that omits ``sender``.
    """
    raw_urls = input('URLs(separate with semicolons):')
    # The prompt promises ';' as the separator, so split on exactly that and
    # trim each piece; splitting on '; ' turned "a;b" into one bogus URL.
    urls = [piece.strip() for piece in raw_urls.split(';') if piece.strip()]
    wordinput = input('Keywords(separate with semicolons):')

    alternatives = []
    for group in wordinput.split(';'):
        tokens = group.split()  # never yields empty strings
        if not tokens:
            continue
        terms = []
        for word in tokens:
            # A leading '-' marks a word that must NOT appear.
            neg = word[0] == '-'
            if neg:
                word = word[1:]
            term = analyzer.Contains(word)
            if neg:
                term = analyzer.NegPattern(term)
            terms.append(term)
        alternatives.append(analyzer.AndPattern(*terms))
    matcher = analyzer.filter_tag('title')(analyzer.OrPattern(*alternatives))

    send_urls = set()   # URLs already reported, kept across scans
    send_lines = []     # report lines gathered during the current scan

    def handler(url, document):
        res = matcher.search(document)
        print(res)
        if not res:
            return
        bs = bs4.BeautifulSoup(document, "html5lib")
        title = bs.find('title').get_text()
        if url not in send_urls:
            send_urls.add(url)
            send_lines.append('{}: {}'.format(title, url))

    try:
        while True:
            for url in urls:
                # The positional 2 is passed through as-is; presumably a crawl
                # depth, to be confirmed against ``search``.
                search(url, 2, handler, html_rendering=False)
            if send_lines:
                sender.send(
                    'Topic Alert',
                    'The messages you subscribed are found here:\n{}'.format(
                        '\n'.join(send_lines)))
                send_lines.clear()
                print("Sent")
            sleep_secs = 900
            print(f'Next scan in {sleep_secs} seconds')
            time.sleep(sleep_secs)
    except KeyboardInterrupt:
        print("Stopped")
Пример #3
0
async def on_message(message):
    """React to an incoming chat message.

    A message whose content is exactly ``"H"`` is answered with an embed
    showing a fixed image.  The content is then printed and passed to
    ``spider.search``; unless the search returns ``"NOT FOUND"`` or the
    author's nickname is ``"TEST_LOL"``, every item yielded by
    ``spider.web_crawl`` for the search result is sent to the channel.

    Args:
        message: The received message; ``content``, ``channel`` and ``author``
            are read from it.
    """
    text = message.content
    if text == "H":
        emb = discord.Embed()
        emb.set_image(url="https://i.stack.imgur.com/LDUmI.png")
        await message.channel.send(content=None, embed=emb)
    print(text)

    target = spider.search(text)
    if target == "NOT FOUND":
        return

    # Authors that are not guild members (e.g. in direct messages) have no
    # ``nick`` attribute; treat that like "no nickname" instead of crashing.
    if getattr(message.author, "nick", None) == "TEST_LOL":
        return

    for reply in spider.web_crawl(target):
        await message.channel.send(content=reply)
Пример #4
0
def calculate_simple(w_sheet, r_sheet, r):
    """Fill one output row with looked-up names and a measured ratio.

    The first three columns of sheet row ``r + 1`` are copied from
    ``r_sheet`` to ``w_sheet``.  If the source row already has a value in
    column 6, or has no value in column 2 or column 3, nothing else happens.
    Otherwise ``spider.search`` is called with the column-3 and column-2
    values, the result's ``SCO_NAME`` and ``ECO_NAME`` are written to columns
    4 and 5, and the value returned by ``get_fwhr`` for the person's image
    (when truthy) is written to column 6.

    Args:
        w_sheet: Sheet being written; needs ``cell(row=, column=)`` returning
            an object with a ``value`` attribute.
        r_sheet: Sheet being read; same interface.
        r: Zero-based row index; the sheet row used is ``r + 1``.

    Returns:
        None.  Exceptions raised by the lookup or by ``get_fwhr`` are printed
        rather than propagated; interpreter-exit signals such as
        ``KeyboardInterrupt`` are no longer swallowed.
    """
    row = r + 1
    print(r)
    for col in (1, 2, 3):
        w_sheet.cell(row=row, column=col).value = r_sheet.cell(
            row=row, column=col).value

    compony = r_sheet.cell(row=row, column=3).value
    name = r_sheet.cell(row=row, column=2).value
    ratio = r_sheet.cell(row=row, column=6).value

    if ratio:
        # Row was already measured on an earlier run.
        print('pass')
        return
    if not compony or not name:
        return

    try:
        result = spider.search(compony, name)
        if not result:
            return
        w_sheet.cell(row=row, column=4).value = result['SCO_NAME']
        w_sheet.cell(row=row, column=5).value = result['ECO_NAME']

        personID = result['personID']
        if not personID:
            return
        imagePath = spider.getImagePath(personID)
    except Exception as e:
        print(e)
        # Prefix match: newer interpreters append ", not 'str'" to this
        # TypeError message, which an exact comparison would miss.
        if str(e).startswith('string indices must be integers'):
            print("sleep")
        # No image path can exist once the lookup has failed.
        return

    if not imagePath:
        return

    imagename = '_'.join([
        compony, name, result['SCO_NAME'], result['CER_NUM'],
        result['PTI_NAME'], result['ECO_NAME'],
        result['PPP_GET_DATE'], result['PPP_END_DATE'],
    ])
    try:
        ratio = get_fwhr(imagePath, url=True, show=False, imagename=imagename)
        if ratio:
            w_sheet.cell(row=row, column=6).value = ratio
            print('ok: {}'.format(ratio))
    except Exception as exct:
        print(exct)
        print(result)
        print(imagePath)
Пример #5
0
def checkfile(file, size=7, threshold=0.1):
    """Compare a file against search results and collect matching blocks.

    The file is read with ``readfile``, its keywords are extracted with
    ``getkeywords`` and used to query ``search``.  Each search result is read
    the same way and compared with the input through
    ``evaluate(text, text2, size, threshold)``.  Results that yield at least
    one block are added to the report.

    Args:
        file: Location of the input, passed to ``readfile``.
        size: Passed through to ``evaluate``.
        threshold: Passed through to ``evaluate``.

    Returns:
        dict with the keys ``'input'`` (the ``file`` argument), ``'keywords'``
        (keywords of the input, empty if it could not be read) and
        ``'results'``: a list of dicts with ``'origin'``, ``'title'``,
        ``'keywords'``, ``'blocks'`` and ``'plagiarized'`` (the mean of the
        blocks' ``'plagiarized'`` values).
    """
    res = {'input': file, 'keywords': [], 'results': []}

    # Read the full text of the input.
    text = readfile(file)
    if text is None:
        print('This file appears to be invalid')
        return res

    keywords = getkeywords(text)
    print('Keywords: ', ', '.join(keywords))
    res['keywords'] = keywords

    results = search(keywords).results()
    if not results:
        print('No result.')
        return res

    for number, result in enumerate(results, start=1):
        try:
            print('\nProcessing file {0}: {1} ({2})'.format(
                number, result['name'], result['url']))

            start = timeit.default_timer()

            text2 = readfile(result['url'])
            if text2 is None:
                print('This file appears to be invalid')
                continue

            keywords2 = getkeywords(text2)
            print('Keywords: ', ', '.join(keywords2))

            # Compare segments to detect plagiarised blocks.
            blocks = evaluate(text, text2, size, threshold)
            print('Search ended, found {0} plagiated blocks'.format(
                len(blocks)))
        except Exception as exc:
            # One unreadable candidate must not abort the whole check, but the
            # reason is reported instead of vanishing; Ctrl-C still works.
            print('Skipping file {0}: {1}'.format(number, exc))
            continue

        if not blocks:
            continue

        res['results'].append({
            'origin': result['url'],
            'title': result['name'],
            'keywords': keywords2,
            'blocks': blocks,
            'plagiarized':
            sum(block['plagiarized'] for block in blocks) / len(blocks),
        })

        print('File {0} processed in {1} seconds'.format(
            number, timeit.default_timer() - start))

    return res
Пример #6
0
def calculate(w_sheet, r_sheet, r, image_map):
    """Fill one output row from a cached image name or a fresh lookup.

    The first three columns of sheet row ``r + 1`` are copied from
    ``r_sheet`` to ``w_sheet``.  Rows without a value in column 2 or column 3
    are otherwise skipped.

    If ``"<col2>_<col3>"`` is a key of ``image_map``, the mapped name is split
    on ``'_'`` and its fields 2..7 are written to columns 4..9; the ratio is
    then measured from ``'image/' + <mapped name>``.  Otherwise
    ``spider.search`` supplies the six fields, the image location is resolved
    through ``spider.getPersonID`` / ``spider.getImagePath``, and the ratio is
    measured from that location.  A truthy ratio is written to column 10.

    Args:
        w_sheet: Sheet being written; needs ``cell(row=, column=)`` returning
            an object with a ``value`` attribute.
        r_sheet: Sheet being read; same interface.
        r: Zero-based row index; the sheet row used is ``r + 1``.
        image_map: Mapping from ``"<col2>_<col3>"`` to an image file name.

    Returns:
        None.  Exceptions raised by the lookup or by ``get_fwhr`` are printed
        rather than propagated; interpreter-exit signals such as
        ``KeyboardInterrupt`` are no longer swallowed.
    """
    row = r + 1
    for col in (1, 2, 3):
        w_sheet.cell(row=row, column=col).value = r_sheet.cell(
            row=row, column=col).value

    compony = r_sheet.cell(row=row, column=2).value
    name = r_sheet.cell(row=row, column=3).value

    if not compony or not name:
        return

    print(r)
    key = compony + '_' + name
    if key in image_map:
        cached = image_map[key]
        # NOTE(review): taking fields 2..7 assumes neither the company nor the
        # name contains an underscore; confirm against how image_map is built.
        infos = cached.split('_')
        for offset in range(6):
            w_sheet.cell(row=row, column=offset + 4).value = infos[offset + 2]
        try:
            ratio = get_fwhr('image/' + cached, show=False)
            if ratio:
                w_sheet.cell(row=row, column=10).value = ratio
        except Exception as exct:
            print(exct)
            print(cached)
        return

    try:
        result = spider.search(compony, name)
        if not result:
            return
        w_sheet.cell(row=row, column=4).value = result['SCO_NAME']
        w_sheet.cell(row=row, column=5).value = result['CER_NUM']
        w_sheet.cell(row=row, column=6).value = result['PTI_NAME']
        w_sheet.cell(row=row, column=7).value = result['ECO_NAME']
        w_sheet.cell(row=row, column=8).value = result['PPP_GET_DATE']
        w_sheet.cell(row=row, column=9).value = result['PPP_END_DATE']

        personID = spider.getPersonID(result['PPP_ID'])
        if not personID:
            return
        imagePath = spider.getImagePath(personID)
    except Exception as e:
        print(e)
        # Prefix match: newer interpreters append ", not 'str'" to this
        # TypeError message, which an exact comparison would miss.
        if str(e).startswith('string indices must be integers'):
            print("sleep")
        # No image path can exist once the lookup has failed.
        return

    if not imagePath:
        return

    imagename = '_'.join([
        compony, name, result['SCO_NAME'], result['CER_NUM'],
        result['PTI_NAME'], result['ECO_NAME'],
        result['PPP_GET_DATE'], result['PPP_END_DATE'],
    ])
    try:
        ratio = get_fwhr(imagePath, url=True, show=False, imagename=imagename)
        if ratio:
            w_sheet.cell(row=row, column=10).value = ratio
            print('ok')
    except Exception as exct:
        print(exct)
        print(result)
        print(imagePath)
Пример #7
0
        year = int(date[0:2])
        
        next_day = day + 1
        previous_day = day - 1
        next_month = month
        previous_month = month
        
        if day == final_day[month]:
            next_day = 1
            next_month += 1
        elif day == 1:
            previous_day = final_day[month - 1]
            previous_month -= 1
        elif next_month > 12:
            next_month = 1
        elif previous_month < 1:
            previous_month = 12

        

        time = '20' + str(az(year)) + '-' + str(az(previous_month)) + '-' \
               + str(az(previous_day)) + '+~+20' + str(az(year)) + '-' \
               + str(az(next_month)) + '-' + str(az(next_day))

        rangetime = time
 
        spider.search(code)

        #recheck(code,pdf_save + file)
        
Пример #8
0
from spider import search, crawl
from utilities import cli
import sys


if __name__ == '__main__':
    # Look the requested title up; stop right away when nothing matches.
    query = input('Enter comic name: ')
    matches = search(query)
    if not matches:
        print('No item found')
        sys.exit()

    # Let the user pick one of the hits, then resolve its link.
    comic = cli.select(matches.keys())
    link = matches[comic]

    # Download only after an explicit confirmation.
    if cli.confirm(f'Download {comic} now?'):
        crawl(comic, link)

    print('DONE!')