def recheck(code, file):
    """Re-run text extraction for a small PDF and retry the search on failure.

    If *file* is at most 15 000 bytes, invoke the Node.js extractor to
    produce a matching ``./txt/<name>.txt`` file; if no text file appears,
    fall back to a fresh ``spider.search(code)``.
    """
    if os.path.getsize(file) <= 15000:
        # Map the source path to './txt/<name>.txt': drop the 6-char
        # directory prefix and the 4-char extension (same slice as before,
        # written idiomatically as [6:-4]).
        txt_path = './txt/' + file[6:-4] + '.txt'
        cmd = 'node extract_text.js ' + file + " " + txt_path
        print(cmd)
        # NOTE(review): os.system runs through the shell; paths with spaces
        # or shell metacharacters will break — consider
        # subprocess.run(['node', 'extract_text.js', file, txt_path]).
        os.system(cmd)
        # BUG FIX (idiom): was `if os.path.exists(txt_path) == False:`.
        if not os.path.exists(txt_path):
            spider.search(code)
def eventloop(sender=None):
    """Poll a set of URLs and alert when a page title matches the keywords.

    Prompts for URLs and keyword groups, builds a title-scoped pattern
    (groups are OR'ed together; words inside a group are AND'ed; a leading
    '-' negates a word), then scans forever every 15 minutes, sending one
    message per batch of newly matched pages.  Stops on Ctrl-C.

    sender: object exposing ``send(subject, body)``; defaults to a fresh
        ``ConsoleSender`` built per call.
    """
    # BUG FIX: the default was `sender=ConsoleSender()`, which Python
    # evaluates ONCE at definition time, so every call shared the same
    # sender instance.  Build it lazily instead (backward-compatible).
    if sender is None:
        sender = ConsoleSender()
    urls = input('URLs(separate with semicolons):').split('; ')
    wordinput = input('Keywords(separate with semicolons):')
    pattern = []
    for words in wordinput.split(';'):
        if not words.strip():
            continue
        subpattern = []
        for word in words.strip().split():
            if not word:
                continue
            # A leading '-' negates the word.
            neg = word[0] == '-'
            if neg:
                word = word[1:]
            lo = analyzer.Contains(word)
            if neg:
                lo = analyzer.NegPattern(lo)
            subpattern.append(lo)
        pattern.append(analyzer.AndPattern(*subpattern))
    pattern = analyzer.OrPattern(*pattern)
    # Restrict matching to the document's <title> tag.
    pattern = analyzer.filter_tag('title')(pattern)

    send_urls = set()   # URLs already reported, to avoid duplicate alerts
    send_lines = []     # pending report lines for the next message

    def handler(url, document):
        # Called once per crawled page; records newly matching titles.
        res = pattern.search(document)
        print(res)
        if res:
            bs = bs4.BeautifulSoup(document, "html5lib")
            title = bs.find('title').get_text()
            if url not in send_urls:
                send_urls.add(url)
                send_lines.append('{}: {}'.format(title, url))

    try:
        while True:
            for url in urls:
                search(url, 2, handler, html_rendering=False)
            if send_lines:
                sender.send(
                    'Topic Alert',
                    """The messages you subscribed are found here:
{}""".format('\n'.join(send_lines)))
                send_lines = []
                print("Sent")
            sleep_secs = 900
            print(f'Next scan in {sleep_secs} seconds')
            time.sleep(sleep_secs)
    except KeyboardInterrupt:
        print("Stopped")
async def on_message(message):
    """Handle an incoming Discord message.

    Replies to a literal "H" with a fixed meme image, then runs the
    message text through the spider; when a target is found (and the
    author is not the test account), crawls it and sends each resulting
    text chunk back to the channel.
    """
    content = message.content
    if content == "H":
        embed = discord.Embed()
        embed.set_image(url="https://i.stack.imgur.com/LDUmI.png")
        await message.channel.send(content=None, embed=embed)
    print(content)
    found = spider.search(content)
    # Skip both unmatched queries and the designated test account.
    if found != "NOT FOUND" and message.author.nick != "TEST_LOL":
        for chunk in spider.web_crawl(found):
            await message.channel.send(content=chunk)
def calculate_simple(w_sheet, r_sheet, r):
    """Copy row *r* from r_sheet into w_sheet and fill in the fWHR ratio.

    Column layout (1-based, per this function's reads): 1-3 copied
    verbatim, 2 = person name, 3 = company, 6 = previously computed
    ratio.  Looks the person up via the spider, fetches their photo and
    writes the facial width-to-height ratio into column 6.  All failures
    are logged and leave the row partially filled.
    """
    print(r)
    # Copy the first three columns unchanged.
    for index in range(3):
        w_sheet.cell(row=r + 1, column=index + 1).value = r_sheet.cell(
            row=r + 1, column=index + 1).value
    compony = r_sheet.cell(row=r + 1, column=3).value
    name = r_sheet.cell(row=r + 1, column=2).value
    ratio = r_sheet.cell(row=r + 1, column=6).value
    if ratio:
        # Ratio already present from an earlier run; skip this row.
        print('pass')
        return
    if not compony or not name:
        return
    result = {}
    imagePath = ''
    try:
        result = spider.search(compony, name)
        if not result:
            return
        w_sheet.cell(row=r + 1, column=4).value = result['SCO_NAME']
        w_sheet.cell(row=r + 1, column=5).value = result['ECO_NAME']
        personID = result['personID']
        if not personID:
            return
        imagePath = spider.getImagePath(personID)
        if not imagePath:
            return
    except Exception as e:
        # BUG FIX: was `except BaseException`, which also swallowed
        # KeyboardInterrupt/SystemExit and made the batch unstoppable.
        print(e)
        if str(e) == 'string indices must be integers':
            # The spider returned a string instead of a dict —
            # presumably a rate-limit/error page; verify upstream.
            print("sleep")
        return
    if not result or not imagePath:
        return
    imagename = compony + '_' + name + '_' + result['SCO_NAME'] + '_' + result['CER_NUM'] \
        + '_' + result['PTI_NAME'] + '_' + result['ECO_NAME'] + '_' + \
        result['PPP_GET_DATE'] + '_' + result['PPP_END_DATE']
    try:
        ratio = get_fwhr(imagePath, url=True, show=False, imagename=imagename)
        if ratio:
            w_sheet.cell(row=r + 1, column=6).value = ratio
            print('ok: {}'.format(ratio))
    except Exception as exct:
        # BUG FIX: narrowed from BaseException (see above).
        print(exct)
        print(result)
        print(imagePath)
def checkfile(file, size=7, threshold=0.1):
    """Check *file* for plagiarism against web search results.

    Extracts keywords from the file, searches with them, and compares
    the text against each result in segment blocks of *size* using
    *threshold* for the similarity cutoff.

    Returns a dict with keys 'input', 'keywords' and 'results', where
    each result entry carries the origin URL, title, its keywords, the
    matched blocks and an aggregate 'plagiarized' score.
    """
    res = {'input': file, 'keywords': [], 'results': []}
    # Read the full text of the input file.
    text = readfile(file)
    if text is None:
        print('This file appears to be invalid')
        return res
    # Extract search keywords from the input text.
    keywords = getkeywords(text)
    print('Keywords: ', ', '.join(keywords))
    res['keywords'] = keywords
    # Search the web using the extracted keywords.
    results = search(keywords).results()
    if not results:
        print('No result.')
        return res
    # Compare the input against each candidate document.
    for i, result in enumerate(results):
        try:
            print('\nProcessing file {0}: {1} ({2})'.format(
                str(i + 1), result['name'], result['url']))
            start = timeit.default_timer()
            text2 = readfile(result['url'])
            if text2 is None:
                print('This file appears to be invalid')
                continue
            keywords2 = getkeywords(text2)
            print('Keywords: ', ', '.join(keywords2))
            # Compare segments to detect plagiarised blocks.
            blocks = evaluate(text, text2, size, threshold)
            print('Search ended, found {0} plagiated blocks'.format(
                len(blocks)))
        except Exception:
            # BUG FIX: was a bare `except:`, which also swallowed
            # KeyboardInterrupt/SystemExit; skip only ordinary errors.
            continue
        if not blocks:
            continue
        res['results'].append({
            'origin': result['url'],
            'title': result['name'],
            'keywords': keywords2,
            'blocks': blocks,
            # Mean plagiarised fraction across the detected blocks.
            'plagiarized': sum(block['plagiarized'] for block in blocks) / len(blocks)
        })
        print('File {0} processed in {1} seconds'.format(
            str(i + 1), timeit.default_timer() - start))
    return res
def calculate(w_sheet, r_sheet, r, image_map):
    """Copy row *r* from r_sheet into w_sheet and fill columns 4-10.

    Column layout (1-based, per this function's reads): 1-3 copied
    verbatim, 2 = company, 3 = name, 10 = computed fWHR ratio.  If a
    cached image exists in *image_map* (keyed 'company_name', value an
    underscore-separated filename whose fields 3-8 populate columns
    4-9), the local image is used; otherwise the spider is queried and
    the photo downloaded.  Failures are logged and leave the row
    partially filled.
    """
    # Copy the first three columns unchanged.
    for index in range(3):
        w_sheet.cell(row=r + 1, column=index + 1).value = r_sheet.cell(
            row=r + 1, column=index + 1).value
    compony = r_sheet.cell(row=r + 1, column=2).value
    name = r_sheet.cell(row=r + 1, column=3).value
    if not compony or not name:
        return
    print(r)
    key = compony + '_' + name
    if key in image_map:
        # Cached image available: take the metadata from the filename
        # and compute the ratio from the local copy.
        infos = image_map[key].split('_')
        for index in range(6):
            w_sheet.cell(row=r + 1, column=index + 4).value = infos[index + 2]
        try:
            ratio = get_fwhr('image/' + image_map[key], show=False)
            if ratio:
                w_sheet.cell(row=r + 1, column=10).value = ratio
        except Exception as exct:
            # BUG FIX: was `except BaseException`, which also swallowed
            # KeyboardInterrupt/SystemExit and made the batch unstoppable.
            print(exct)
            print(image_map[key])
        return
    result = {}
    imagePath = ''
    try:
        result = spider.search(compony, name)
        if not result:
            return
        w_sheet.cell(row=r + 1, column=4).value = result['SCO_NAME']
        w_sheet.cell(row=r + 1, column=5).value = result['CER_NUM']
        w_sheet.cell(row=r + 1, column=6).value = result['PTI_NAME']
        w_sheet.cell(row=r + 1, column=7).value = result['ECO_NAME']
        w_sheet.cell(row=r + 1, column=8).value = result['PPP_GET_DATE']
        w_sheet.cell(row=r + 1, column=9).value = result['PPP_END_DATE']
        personID = spider.getPersonID(result['PPP_ID'])
        if not personID:
            return
        imagePath = spider.getImagePath(personID)
        if not imagePath:
            return
    except Exception as e:
        # BUG FIX: narrowed from BaseException (see above).
        print(e)
        if str(e) == 'string indices must be integers':
            # The spider returned a string instead of a dict —
            # presumably a rate-limit/error page; verify upstream.
            print("sleep")
        return
    if not result or not imagePath:
        return
    imagename = compony + '_' + name + '_' + result['SCO_NAME'] + '_' + result['CER_NUM'] \
        + '_' + result['PTI_NAME'] + '_' + result['ECO_NAME'] + '_' + \
        result['PPP_GET_DATE'] + '_' + result['PPP_END_DATE']
    try:
        ratio = get_fwhr(imagePath, url=True, show=False, imagename=imagename)
        if ratio:
            w_sheet.cell(row=r + 1, column=10).value = ratio
            print('ok')
    except Exception as exct:
        # BUG FIX: narrowed from BaseException (see above).
        print(exct)
        print(result)
        print(imagePath)
# Build a one-day search window around (day, month) and kick off the search.
year = int(date[0:2])
next_day = day + 1
previous_day = day - 1
next_month = month
previous_month = month
# Wrap the day across month boundaries.  These two cases are mutually
# exclusive for any real month (no month has final_day == 1).
if day == final_day[month]:
    next_day = 1
    next_month += 1
if day == 1:
    # assumes final_day[month - 1] is the previous month's length,
    # including month == 1 mapping to index 0 — TODO confirm.
    previous_day = final_day[month - 1]
    previous_month -= 1
# BUG FIX: these month-wrap checks were chained as `elif` after the only
# branches that modify next_month/previous_month, so they were
# unreachable and December/January produced month 13 / month 0.
if next_month > 12:
    next_month = 1
if previous_month < 1:
    previous_month = 12
# NOTE(review): the year component is identical on both sides of the
# range even when the month wraps across the December/January boundary,
# matching the original code — confirm whether that matters here.
# NOTE(review): `time` shadows the stdlib time module at this scope;
# kept as-is because later code may read it.
time = '20' + str(az(year)) + '-' + str(az(previous_month)) + '-' \
    + str(az(previous_day)) + '+~+20' + str(az(year)) + '-' \
    + str(az(next_month)) + '-' + str(az(next_day))
rangetime = time
spider.search(code)
#recheck(code,pdf_save + file)
from spider import search, crawl
from utilities import cli
import sys

if __name__ == '__main__':
    # Interactive entry point: search for a comic by name, let the user
    # pick one result, confirm, then download it.
    keyword = input('Enter comic name: ')
    found = search(keyword)
    if not found:
        print('No item found')
        sys.exit()
    chosen = cli.select(found.keys())
    if cli.confirm(f'Download {chosen} now?'):
        crawl(chosen, found[chosen])
        print('DONE!')