def splitFiles(type):  # type = 'head'
    """Extract the head or tail section of each eligible paper and flag it in the DB.

    Parameters
    ----------
    type : str
        Section to extract: 'head' or 'tail'.  (The name shadows the builtin
        ``type`` but is kept unchanged for backward compatibility with callers.)

    Relies on module-level ``pd``, ``db``, ``cur``, ``_getHead`` and ``_getTail``.
    """
    # NOTE(review): the column name is interpolated straight into the SQL; this is
    # only safe because `type` comes from trusted call sites ('head'/'tail').
    if type == 'head':
        sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and %s = 1 and id in (1702);' % (type)
    else:
        sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and %s = 1 and id in (1702);' % (type)
    print(sql)
    papers = pd.read_sql(sql, con=db)
    for index, row in papers.iterrows():  # row = [30092, 9,]
        # fix: initialize res so an unexpected `type` value cannot raise
        # UnboundLocalError at the `if res:` check below.
        res = None
        if type == 'head':
            res = _getHead(row['id'], row['npages'])
        elif type == 'tail':
            res = _getTail(row['id'], row['npages'])
        if res:
            sql = "update resolved_papers set %s = 1 where id = %s" % (type, row[0])
            try:
                cur.execute(sql)
                db.commit()
            except Exception:  # fix: bare except also swallowed KeyboardInterrupt/SystemExit
                db.rollback()
            print("Id: %s. %s: %s" % (row[0], type.title(), res))
    cur.close()
def prepareTail():
    """Run ``_prepareTail`` for every paper whose head and tail are already extracted."""
    sql = 'select id from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and tail = 1 and id in (1702);'
    papers = pd.read_sql(sql, con=db)
    for _, paper in papers.iterrows():
        prepared = _prepareTail(paper[0])
        print("Id: %s. Tail prepared: %s" % (paper[0], prepared))
    cur.close()
def _countOccurencies(papers):
    """Count filter-keyword occurrences in one paper title and store them in the DB.

    Parameters
    ----------
    papers : tuple
        ``(id, title)`` pair for a single paper.

    Opens its own connection so the function can run in a multiprocessing pool.
    Relies on module-level ``pymysql``, ``nltk``, ``_processText`` and ``_processNL``.
    """
    # keep this connection in order to use multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="iwJx0EAM",  # NOTE(review): credentials hard-coded in source
                         db="clpd")
    cur = db.cursor()
    try:
        id, title = papers
        # title = 'CLEU‐A Cross‐Language English‐Urdu Corpus and Benchmark for Text Reuse Experiments'
        # fix(idiom): the original applied .lower().strip() to each literal —
        # a spelled-out no-op — and iterated with a manual while/index loop.
        # The normalized values are written directly and iterated with `for`.
        keywords = ["cross-language", "crosslanguage", "cross-lingual",
                    "crosslingual", "cross-linguistic", "crosslinguistic",
                    "multi-language", "multilanguage", "multi-lingual",
                    "multilingual", "multi-linguistic", "multilinguistic",
                    "machine-translation", "copy", "duplicate", "plagiarism",
                    "detection", "discovery"]
        text = _processText(title)
        words = _processNL(text)
        fdist = nltk.FreqDist(words)
        for keyword in keywords:
            if fdist[keyword] > 0:
                sql = "insert into resolved_papers_title_occurrencies values (%s, '%s', %s);" % (
                    id, keyword, fdist[keyword])
                try:
                    cur.execute(sql)
                    db.commit()
                    print('saved')
                except Exception:  # fix: narrowed from bare except
                    db.rollback()
    except Exception:  # fix: narrowed from bare except
        db.rollback()
        print('no saved')
    cur.close()
def classifyPub():
    """Classify every eligible untyped paper and persist the resulting type."""
    sql = 'select id, npages from resolved_papers where downloaded = 1 and npages >= 5 and pdf2text = 1 and english = 1 and head = 1 and tail = 1 and type is NULL;'
    papers = pd.read_sql(sql, con=db)
    for _, paper in papers.iterrows():
        label = _classifyPub(paper[0], paper[1])
        if not label:
            continue  # nothing to record for this paper
        sql = "update resolved_papers set type = '%s' where id = %s" % (
            label, paper[0])
        print(sql)
        try:
            cur.execute(sql)
            db.commit()
        except:
            db.rollback()
        print("Id: %s. type: %s" % (paper[0], label))
    cur.close()
def updateNumPages():
    """Fill in the page count for every downloaded paper that lacks one."""
    papers = pd.read_sql(
        'select id from resolved_papers where downloaded = 1 and npages is NULL;',
        con=db)
    for _, paper in papers.iterrows():
        npages = _getNPages(paper['id'])
        if not npages:
            continue  # page count could not be determined
        sql = "update resolved_papers set npages = %s where id = %s" % (
            npages, paper[0])
        try:
            cur.execute(sql)
            db.commit()
        except:
            db.rollback()
        print("Id: %s. Num Pages: %s" % (paper[0], npages))
    cur.close()
def _titlesLang(ids):
    """Detect the language of one paper title and record it in resolved_papers_title."""
    # Dedicated connection so this function can run in a multiprocessing pool.
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",
        db="clpd")
    cur = db.cursor()
    try:
        paper_id, title = ids
        lang = _checkTitle(title)
        sql = "insert into resolved_papers_title values (%s, '%s');" % (paper_id, lang)
        print(sql)
        cur.execute(sql)
        db.commit()
        print("saved")
    except:
        db.rollback()
        print('no saved')
    cur.close()
def downloadPDFIEEE(ids):
    """Download one IEEE paper PDF and mark it as downloaded in the DB.

    Parameters
    ----------
    ids : tuple
        ``(i, res_title, main_link, direct_link)`` — paper id, title language,
        landing-page URL and direct PDF URL.

    Strategy: first attempt uses ``DownloadPDF`` on the direct link; on failure
    an IEEE "stamp" URL is rebuilt from the numeric id in ``main_link`` and the
    second (and last) attempt fetches it raw with ``requests``.  Opens its own
    connection so the function can run in a multiprocessing pool.
    """
    # keep this connection in order to use multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="iwJx0EAM",  # NOTE(review): credentials hard-coded in source
                         db="clpd")
    cur = db.cursor()
    try:
        i, res_title, main_link, direct_link = ids
        p = False
        downloaded = "False"
        count = 0
        if res_title and res_title == "en":  # only English titles are fetched
            print(i)
            print(main_link)
            print(direct_link)
            url = direct_link
            while downloaded == "False" and count < 2:
                count += 1
                if count == 2:
                    # fix: the original wrote to the literal path '%s.pdf' and then
                    # applied `% (i)` to write()'s None return value (TypeError);
                    # the id must be interpolated into the path before opening.
                    file = requests.get(url)
                    with open('/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/%s.pdf' % (i), 'wb') as out:
                        out.write(file.content)
                    p = True
                else:
                    if url:
                        s = DownloadPDF()
                        p = s.download(url,
                                       destination='/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/',
                                       path=str(i) + '.pdf')
                if p == True:
                    downloaded = "True"
                else:
                    toParse = main_link
                    paper_id = (re.findall(r'\d+', toParse))[0]
                    url = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=%s' % (paper_id)
            if downloaded == "True":
                sql = "update resolved_papers set downloaded = 1 where id = %s" % (i)
                try:
                    cur.execute(sql)
                    db.commit()
                    print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
                except Exception:  # fix: narrowed from bare except
                    db.rollback()
            else:
                print("Id: %s. Downloaded: %s." % (i, downloaded))
    except UnicodeDecodeError:
        pass
    cur.close()
def downloadPDF(ids):
    """Fetch one paper's PDF, trying the direct link first and the main link second."""
    # Dedicated connection so this function can run in a multiprocessing pool.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="iwJx0EAM",
                         db="clpd")
    cur = db.cursor()
    try:
        paper_id, res_title, main_link, direct_link = ids
        ok = False
        downloaded = "False"
        attempts = 0
        if res_title and res_title == "en":
            print(paper_id)
            print(main_link)
            print(direct_link)
            url = direct_link
            while downloaded == "False" and attempts < 2:
                attempts += 1
                if url:
                    fetcher = DownloadPDF()
                    ok = fetcher.download(
                        url,
                        destination='/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/',
                        path=str(paper_id) + '.pdf')
                if ok == True:
                    downloaded = "True"
                else:
                    # first attempt failed: retry once with the landing-page link
                    url = main_link
            if downloaded == "True":
                sql = "update resolved_papers set downloaded = 1 where id = %s" % (paper_id)
                try:
                    cur.execute(sql)
                    db.commit()
                    print("Id: %s. Downloaded: %s. Saved!" % (paper_id, downloaded))
                except:
                    db.rollback()
            else:
                print("Id: %s. Downloaded: %s." % (paper_id, downloaded))
    except UnicodeDecodeError:
        pass
    cur.close()
def _filterTitle(papers):
    """Decide whether a title matches the cross-language-plagiarism keyword filter.

    Parameters
    ----------
    papers : tuple
        ``(id, title)`` pair for a single paper.

    Returns
    -------
    bool
        True (and records the id in ``resolved_papers_selected_title``) when any
        keyword group reaches the threshold, False otherwise.

    Opens its own connection so the function can run in a multiprocessing pool.
    """
    # keep this connection in order to use multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="iwJx0EAM",  # NOTE(review): credentials hard-coded in source
                         db="clpd")
    cur = db.cursor()
    try:
        id, title = papers
        threshold = 1
        # title = 'A New Approach for Cross-Language Plagiarism Analysis.'.lower()
        title = title.lower()
        k_dflanguage = 0
        k_copy = 0
        k_detection = 0
        diff_language = ["Cross-language", "Crosslanguage", "Cross-lingual",
                         "Crosslingual", "Cross-linguistic", "Crosslinguistic",
                         "Multi-language", "Multilanguage", "Multi-lingual",
                         "Multilingual", "Multi-linguistic", "Multilinguistic",
                         "Machine-translation", ]
        copy = ["Copy", "Duplicate", "Plagiarism", ]
        detection = ["Detection", "Discovery", ]
        for row in diff_language:
            if row.lower() in title:
                k_dflanguage += 1
        for row in copy:
            if row.lower() in title:
                k_copy += 1
        for row in detection:
            if row.lower() in title:
                k_detection += 1
        print("diff_language: %s." % (k_dflanguage))
        print("copy: %s." % (k_copy))
        print("detection: %s." % (k_detection))
        # fix: the original tested k_detection twice and never k_copy, so the
        # "copy" keyword group could never trigger selection on its own.
        if (k_dflanguage >= threshold or k_copy >= threshold or k_detection >= threshold):
            sql = "insert into resolved_papers_selected_title values (%s)" % (id)
            print(sql)
            cur.execute(sql)
            db.commit()
            return True
        else:
            return False
    except Exception:  # fix: narrowed from bare except
        db.rollback()
        print('no saved')
    cur.close()
def downloadPDF(ids):
    """Fetch one paper's PDF with ``urlretrieve`` and flag it as downloaded.

    Parameters
    ----------
    ids : tuple
        ``(i, main_link, direct_link)`` — paper id, landing-page URL, direct PDF URL.

    Tries the direct link first, then the main link; the paper is marked
    downloaded only when the retrieved content reports an ``application/pdf``
    content type.  Opens its own connection so it can run in a multiprocessing
    pool.
    """
    # keep this connection in order to use multiprocessing
    db = pymysql.connect(
        host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
        user="******",
        passwd="iwJx0EAM",  # NOTE(review): credentials hard-coded in source
        db="clpd")
    cur = db.cursor()
    try:
        i, main_link, direct_link = ids
        downloaded = "False"
        count = 0
        print(i)
        print(main_link)
        print(direct_link)
        url = direct_link
        destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
        while downloaded == "False" and count < 2:
            count += 1
            if url:
                # fix: removed debugging leftovers that clobbered the real inputs
                # (a hard-coded academia.edu URL and `i = 149`, which mis-flagged
                # paper 149 as downloaded) and the undefined `finalurl` reference
                # whose NameError — swallowed by `except: pass` — aborted every
                # attempt before urlretrieve ever ran.
                path = destination + str(i) + '.pdf'
                try:
                    ua = UserAgent()
                    headers = {'User-Agent': str(ua.random)}
                    s = requests.session()
                    res = s.get(url, headers=headers, allow_redirects=False)
                    print(res.url)
                    p = urlretrieve(url, path)
                    if p[1].get_content_type() == 'application/pdf':
                        downloaded = "True"
                except Exception:  # best-effort: any fetch failure falls through to retry
                    pass
            else:
                url = main_link
        if downloaded == "True":
            sql = "update resolved_papers set downloaded = 1 where id = %s" % (i)
            try:
                cur.execute(sql)
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except Exception:  # fix: narrowed from bare except
                db.rollback()
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))
    except UnicodeDecodeError:
        pass
    cur.close()