def _countOccurencies(papers):
    # open a dedicated connection so the function is safe under multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        id, title = papers
        # title = 'CLEU‐A Cross‐Language English‐Urdu Corpus and Benchmark for Text Reuse Experiments'
        keywords = [k.lower() for k in [
            "Cross-language", "Crosslanguage", "Cross-lingual", "Crosslingual",
            "Cross-linguistic", "Crosslinguistic", "Multi-language", "Multilanguage",
            "Multi-lingual", "Multilingual", "Multi-linguistic", "Multilinguistic",
            "Machine-translation", "Copy", "Duplicate", "Plagiarism",
            "Detection", "Discovery",
        ]]
        # normalize and tokenize the title, then count keyword occurrences
        text = _processText(title)
        words = _processNL(text)
        fdist = nltk.FreqDist(words)
        for keyword in keywords:
            if fdist[keyword] > 0:
                # parameterized query instead of string interpolation
                sql = "insert into resolved_papers_title_occurrencies values (%s, %s, %s);"
                try:
                    cur.execute(sql, (id, keyword, fdist[keyword]))
                    db.commit()
                    print('saved')
                except pymysql.Error:
                    db.rollback()
    except Exception:
        db.rollback()
        print('not saved')
    cur.close()
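# ---------------------------------------------------------------------------
# _processText and _processNL are used above but not defined in this section.
# A minimal sketch of what they are assumed to do, inferred only from the
# call sites (normalize the raw title, then tokenize it for nltk.FreqDist);
# the original implementations may differ.
import nltk


def _processText(title):
    # assumed: fold the unicode hyphens seen in the sample title (U+2010)
    # into ASCII '-' and lowercase, so tokens match the lowercased keywords
    return title.replace('\u2010', '-').lower()


def _processNL(text):
    # assumed: plain word tokenization; requires nltk.download('punkt') once
    return nltk.word_tokenize(text)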
def _titlesLang(ids):
    # open a dedicated connection so the function is safe under multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        id, title = ids
        # detect the language of the title and store it alongside the paper id
        lang = _checkTitle(title)
        sql = "insert into resolved_papers_title values (%s, %s);"
        print(sql)
        cur.execute(sql, (id, lang))
        db.commit()
        print("saved")
    except Exception:
        db.rollback()
        print('not saved')
    cur.close()
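# ---------------------------------------------------------------------------
# _checkTitle is used above but not defined in this section. A minimal sketch,
# assuming it returns an ISO 639-1 language code for the title (elsewhere the
# stored value is compared against 'en'); langdetect is one way to do that,
# not necessarily what the original used.
from langdetect import detect


def _checkTitle(title):
    # assumed: detect the title language, e.g. 'en', 'de', 'es'; fall back to
    # an empty string when detection fails (e.g. on an empty title)
    try:
        return detect(title)
    except Exception:
        return ""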
def _filterPub(papers):
    # open a dedicated connection so the function is safe under multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        id, type = papers
        threshold = 5
        k_dflanguage_head = k_dflanguage_tail = 0
        k_copy_head = k_copy_tail = 0
        k_detection_head = k_detection_tail = 0
        diff_language = {w.lower() for w in [
            "Cross-language", "Crosslanguage", "Cross-lingual", "Crosslingual",
            "Cross-linguistic", "Crosslinguistic", "Multi-language", "Multilanguage",
            "Multi-lingual", "Multilingual", "Multi-linguistic", "Multilinguistic",
            "Machine-translation",
        ]}
        copy = {w.lower() for w in ["Copy", "Duplicate", "Plagiarism"]}
        detection = {w.lower() for w in ["Detection", "Discovery"]}
        sql = ("select section, sstring, freq from resolved_papers_occurrenciesv4 "
               "where id = %s and type = 'paper'")
        cur.execute(sql, (id,))
        res = cur.fetchall()
        # accumulate head/tail frequencies per keyword group
        for section, sstring, freq in res:
            if sstring in diff_language:
                if section == "head":
                    k_dflanguage_head += freq
                elif section == "tail":
                    k_dflanguage_tail += freq
            if sstring in copy:
                if section == "head":
                    k_copy_head += freq
                elif section == "tail":
                    k_copy_tail += freq
            if sstring in detection:
                if section == "head":
                    k_detection_head += freq
                elif section == "tail":
                    k_detection_tail += freq
        # an earlier variant also required the detection counts to pass the
        # threshold and flagged the paper with resolved_papers.toread = 1
        if (k_dflanguage_head >= threshold and k_dflanguage_tail >= threshold) and \
           (k_copy_head >= threshold and k_copy_tail >= threshold):
            print(id)
            return id
    except Exception:
        pass
def _downloadIEEE():
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    sql = ("SELECT p.id, p.main_link, p.direct_link FROM `resolved_papers` p "
           "inner join `resolved_papers_title` pt on pt.Id = p.Id "
           "where p.source like '%ieee%' and p.downloaded = 0 "
           "and pt.`title_language` = 'en';")
    papers = pd.read_sql(sql, con=db)
    for index, row in papers.iterrows():
        id = row['id']
        main_link = row['main_link']
        destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
        path = destination + str(id) + '.pdf'
        print(path)
        # the IEEE arnumber is the first run of digits in the main link
        paper_id = re.findall(r'\d+', main_link)[0]
        try:
            # fetch the PDF with wget (an earlier variant used requests instead)
            cmd = ('wget "https://ieeexplore.ieee.org/stampPDF/getPDF.jsp'
                   '?tp=&isnumber=&arnumber=%s" -O %s' % (paper_id, path))
            os.system(cmd)
            sql = "update resolved_papers set downloaded = 1 where id = %s"
            try:
                cur.execute(sql, (id,))
                db.commit()
                print("Id: %s. Downloaded: True. Saved!" % id)
            except pymysql.Error:
                db.rollback()
        except Exception:
            print('Failed to fetch the IEEE page with identifier %s due to a request exception.' % id)
            time.sleep(randint(1, 6))
def _downloadSpringer(ids):
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        id, main_link, direct_link = ids
        # SPRINGER: rewrite the landing-page URL into the direct PDF URL, e.g.
        #   https://link.springer.com/article/10.1007/s10579-014-9282-3
        #   -> https://link.springer.com/content/pdf/10.1007/s10579-014-9282-3.pdf
        if 'article' in main_link:
            url_pdf = main_link.replace('article', 'content/pdf') + '.pdf'
        elif 'chapter' in main_link:
            # e.g. http://link.springer.com/chapter/10.1007/978-3-319-09846-3_4/fulltext.html
            url_pdf = main_link.replace('chapter', 'content/pdf') + '.pdf'
        ua = str(get_random_ua())
        try:
            response = requests.get(url_pdf, headers={'User-Agent': ua})
        except requests.RequestException:
            print("Connection refused")
            time.sleep(5)
            return
        print(response.status_code)
        if response.status_code == 200:
            content_type = response.headers.get('content-type')
            if 'application/pdf' in str(content_type):
                destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
                path = destination + str(id) + '.pdf'
                with open(path, 'wb') as f:
                    f.write(response.content)
                sql = "update resolved_papers set downloaded = 1 where id = %s"
                try:
                    cur.execute(sql, (id,))
                    db.commit()
                    print("Id: %s. Downloaded: True. Saved!" % id)
                except pymysql.Error:
                    db.rollback()
        else:
            print('Title with identifier %s not found' % id)
    except Exception:
        print('Failed to fetch the Springer page with identifier %s due to a request exception.' % id)
        time.sleep(randint(1, 6))
def _download(ids):
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        id, query = ids
        params = urlencode({'q': query.lower()})
        url = SCHOLARS_BASE_URL + "/search?" + params
        print(url)
        ua = str(get_random_ua())
        try:
            response = requests.get(url, headers={'User-Agent': ua})
        except requests.RequestException:
            print("Connection refused")
            time.sleep(5)
            return
        print(response.status_code)
        if response.status_code == 200:
            soup = BeautifulSoup(response.text, "html.parser")
            item = soup.find_all('div', {'class': 'result'})[0]
            if item:
                link = str(item.contents[1]).split('\n')
                title = re.sub('<[^<]+?>', '', link[2])
                if query.lower() == title.lower():
                    # turn the summary link into the download link, e.g.
                    # /viewdoc/summary;jsessionid=...?doi=10.1.1.317.9673&rank=1
                    # -> /viewdoc/download?doi=10.1.1.317.9673&rep=rep1&type=pdf
                    soup = BeautifulSoup(link[1], "html.parser")
                    a = soup.find("a", class_="remove doc_details")
                    suffix = re.sub(r';.*\?', '?', a.attrs['href'])
                    suffix = suffix.replace('summary', 'download').replace(
                        '&rank=1', '&rep=rep1&type=pdf')
                    url_pdf = SCHOLARS_BASE_URL + suffix
                    print(url_pdf)
                    res = requests.get(url_pdf)
                    content_type = res.headers.get('content-type')
                    if 'application/pdf' in str(content_type):
                        destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
                        path = destination + str(id) + '.pdf'
                        with open(path, 'wb') as f:
                            f.write(res.content)
                        sql = "update resolved_papers set downloaded = 1 where id = %s"
                        try:
                            cur.execute(sql, (id,))
                            db.commit()
                            print("Id: %s. Downloaded: True. Saved!" % id)
                        except pymysql.Error:
                            db.rollback()
                else:
                    print('Title not found with identifier %s' % id)
    except Exception:
        print('Failed to fetch the CiteSeerX page with identifier %s due to a request exception.' % id)
        time.sleep(randint(1, 6))
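# ---------------------------------------------------------------------------
# get_random_ua is used by the download helpers above but not defined in this
# section. A minimal sketch, assuming it returns a random desktop User-Agent
# string to vary the request fingerprint; the original may draw from a larger
# pool (e.g. fake_useragent, as the last downloadPDF variant below does).
import random

_USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
    '(KHTML, like Gecko) Chrome/76.0.3809.100 Safari/537.36',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_6) AppleWebKit/605.1.15 '
    '(KHTML, like Gecko) Version/12.1.2 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0',
]


def get_random_ua():
    # assumed: pick one User-Agent at random per request
    return random.choice(_USER_AGENTS)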
def downloadPDFIEEE(ids):
    # open a dedicated connection so the function is safe under multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        # res_title is the pre-fetched title language (an earlier variant
        # queried resolved_papers_title here instead)
        i, res_title, main_link, direct_link = ids
        p = False
        downloaded = False
        count = 0
        if res_title and res_title == "en":
            print(i)
            print(main_link)
            print(direct_link)
            url = direct_link
            while not downloaded and count < 2:
                count += 1
                if count == 2:
                    # last attempt: fetch the stamp page directly
                    file = requests.get(url)
                    open('/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/%s.pdf' % i,
                         'wb').write(file.content)
                    p = True
                elif url:
                    s = DownloadPDF()
                    p = s.download(url,
                                   destination='/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/',
                                   path=str(i) + '.pdf')
                if p:
                    downloaded = True
                else:
                    # retry with the stamp URL built from the arnumber in the main link
                    paper_id = re.findall(r'\d+', main_link)[0]
                    url = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=%s' % paper_id
            if downloaded:
                sql = "update resolved_papers set downloaded = 1 where id = %s"
                try:
                    cur.execute(sql, (i,))
                    db.commit()
                    print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
                except pymysql.Error:
                    db.rollback()
            else:
                print("Id: %s. Downloaded: %s." % (i, downloaded))
    except UnicodeDecodeError:
        pass
    cur.close()
def downloadPDF(ids):
    # open a dedicated connection so the function is safe under multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        # res_title is the pre-fetched title language
        i, res_title, main_link, direct_link = ids
        p = False
        downloaded = False
        count = 0
        if res_title and res_title == "en":
            print(i)
            print(main_link)
            print(direct_link)
            url = direct_link
            while not downloaded and count < 2:
                count += 1
                if url:
                    s = DownloadPDF()
                    p = s.download(url,
                                   destination='/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/',
                                   path=str(i) + '.pdf')
                if p:
                    downloaded = True
                else:
                    # retry with the main link instead of the direct link
                    url = main_link
            if downloaded:
                sql = "update resolved_papers set downloaded = 1 where id = %s"
                try:
                    cur.execute(sql, (i,))
                    db.commit()
                    print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
                except pymysql.Error:
                    db.rollback()
            else:
                print("Id: %s. Downloaded: %s." % (i, downloaded))
    except UnicodeDecodeError:
        pass
    cur.close()
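# ---------------------------------------------------------------------------
# DownloadPDF is used above but not defined in this section. A minimal sketch
# matching the call sites (s.download(url, destination=..., path=...) returns
# True on success, False otherwise); the original class may do more, such as
# retries or referer headers.
import requests


class DownloadPDF:
    def download(self, url, destination, path):
        # assumed: fetch the URL and save it under destination + path, but
        # only when the server actually returned a PDF
        try:
            response = requests.get(url, timeout=30)
        except requests.RequestException:
            return False
        content_type = response.headers.get('content-type', '')
        if response.status_code == 200 and 'application/pdf' in content_type:
            with open(destination + path, 'wb') as f:
                f.write(response.content)
            return True
        return False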
def _filterTitle(papers):
    # open a dedicated connection so the function is safe under multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        id, title = papers
        threshold = 1
        # title = 'A New Approach for Cross-Language Plagiarism Analysis.'
        title = title.lower()
        k_dflanguage = 0
        k_copy = 0
        k_detection = 0
        diff_language = ["Cross-language", "Crosslanguage", "Cross-lingual", "Crosslingual",
                         "Cross-linguistic", "Crosslinguistic", "Multi-language", "Multilanguage",
                         "Multi-lingual", "Multilingual", "Multi-linguistic", "Multilinguistic",
                         "Machine-translation"]
        copy = ["Copy", "Duplicate", "Plagiarism"]
        detection = ["Detection", "Discovery"]
        for word in diff_language:
            if word.lower() in title:
                k_dflanguage += 1
        for word in copy:
            if word.lower() in title:
                k_copy += 1
        for word in detection:
            if word.lower() in title:
                k_detection += 1
        print("diff_language: %s." % k_dflanguage)
        print("copy: %s." % k_copy)
        print("detection: %s." % k_detection)
        # the original condition tested k_detection twice and never k_copy
        if k_dflanguage >= threshold or k_copy >= threshold or k_detection >= threshold:
            sql = "insert into resolved_papers_selected_title values (%s)"
            print(sql)
            try:
                cur.execute(sql, (id,))
                db.commit()
            except pymysql.Error:
                db.rollback()
            return True
        return False
    except Exception:
        db.rollback()
        print('not saved')
    finally:
        cur.close()
def downloadPDF(ids):
    # NOTE: this second definition shares its name with the variant above and
    # shadows it when both live in the same module.
    # open a dedicated connection so the function is safe under multiprocessing
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******",
                         passwd="******",
                         db="clpd")
    cur = db.cursor()
    try:
        i, main_link, direct_link = ids
        downloaded = False
        count = 0
        print(i)
        print(main_link)
        print(direct_link)
        url = direct_link
        while not downloaded and count < 2:
            count += 1
            if url:
                # debug overrides left in the original: a fixed academia.edu
                # URL and id were used to test redirect handling
                url = "http://www.academia.edu/download/30761819/book.pdf#page=32"
                i = 149
                destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
                path = destination + str(i) + '.pdf'
                try:
                    ua = UserAgent()
                    headers = {'User-Agent': str(ua.random)}
                    # follow redirects with HEAD to see the final URL
                    r = requests.head(url, allow_redirects=True)
                    print(r.url)
                    s = requests.session()
                    res = s.get(url, headers=headers, allow_redirects=False)
                    print(res.url)
                    # download and keep the file only if it really is a PDF
                    p = urlretrieve(url, path)
                    if p[1].get_content_type() == 'application/pdf':
                        downloaded = True
                except Exception:
                    pass
            else:
                url = main_link
        if downloaded:
            sql = "update resolved_papers set downloaded = 1 where id = %s"
            try:
                cur.execute(sql, (i,))
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except pymysql.Error:
                db.rollback()
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))
    except UnicodeDecodeError:
        pass
    cur.close()
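# ---------------------------------------------------------------------------
# Usage sketch: the "dedicated connection" comments above suggest each worker
# is mapped over a multiprocessing pool (MySQL connections cannot be shared
# across processes). A minimal driver under that assumption; the source query
# and pool size here are placeholders, not taken from the original.
from multiprocessing import Pool

if __name__ == '__main__':
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",
                         user="******", passwd="******", db="clpd")
    papers = pd.read_sql("select id, title from resolved_papers", con=db)
    db.close()
    tasks = list(papers.itertuples(index=False, name=None))
    with Pool(processes=4) as pool:
        pool.map(_titlesLang, tasks)  # or _countOccurencies / _filterTitle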