Example #1
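The examples below come from the same crawler module and rely on module-level imports that this excerpt does not show. A plausible set is sketched here; helper functions such as _processText, _processNL, _checkTitle, get_random_ua, DownloadPDF and the constant SCHOLARS_BASE_URL are defined elsewhere in the project.

# Assumed module-level imports for the examples below; _processText, _processNL,
# _checkTitle, get_random_ua, DownloadPDF and SCHOLARS_BASE_URL are project
# helpers defined elsewhere.
import os
import re
import time
from random import randint
from urllib.parse import urlencode
from urllib.request import urlretrieve

import nltk
import pandas as pd
import pymysql
import requests
from bs4 import BeautifulSoup
from fake_useragent import UserAgent  # assumed source of UserAgent().random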
def _countOccurencies(papers):
    # Open a fresh connection per call so the function is safe to use
    # from a multiprocessing pool (connections cannot be shared across workers).
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:
        id, title = papers

        # title = 'CLEU‐A Cross‐Language English‐Urdu Corpus and Benchmark for Text Reuse Experiments'

        keywords = ["cross-language", "crosslanguage",
                    "cross-lingual", "crosslingual",
                    "cross-linguistic", "crosslinguistic",
                    "multi-language", "multilanguage",
                    "multi-lingual", "multilingual",
                    "multi-linguistic", "multilinguistic",
                    "machine-translation",
                    "copy", "duplicate", "plagiarism",
                    "detection", "discovery"]

        # Tokenize the title and build a frequency distribution of its words.
        text = _processText(title)
        words = _processNL(text)
        fdist = nltk.FreqDist(words)

        for keyword in keywords:
            if fdist[keyword] > 0:
                # Parameterized query: the driver handles quoting and escaping.
                sql = "insert into resolved_papers_title_occurrencies values (%s, %s, %s);"
                try:
                    cur.execute(sql, (id, keyword, fdist[keyword]))
                    db.commit()
                    print('saved')
                except:
                    db.rollback()
    except:
        db.rollback()
        print('not saved')
    cur.close()
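The per-call connection exists so that the function can be mapped over a pool of worker processes, as the first comment says. A minimal driver sketch, assuming papers is a list of (id, title) tuples:

from multiprocessing import Pool

# Hypothetical driver: each worker process runs _countOccurencies with
# its own database connection, so connections are never shared.
if __name__ == '__main__':
    papers = [(1, 'Cross-Language Plagiarism Detection'),
              (2, 'A Survey of Text Reuse')]
    with Pool(processes=4) as pool:
        pool.map(_countOccurencies, papers)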
Example #2
def _titlesLang(ids):
    # Open a fresh connection per call so the function is safe to use with multiprocessing.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:
        id, title = ids
        # Detect the title language and store it alongside the paper id.
        lang = _checkTitle(title)
        sql = "insert into resolved_papers_title values (%s, %s);"
        print(sql)
        cur.execute(sql, (id, lang))
        db.commit()
        print("saved")
    except:
        db.rollback()
        print('not saved')
    cur.close()
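_checkTitle is a project helper not shown in this excerpt. A minimal stand-in using the langdetect package, purely as an illustration of the expected contract (take a title, return a language code such as 'en'), might look like:

# Hypothetical stand-in for the project's _checkTitle helper; the real
# implementation may differ. langdetect raises on empty/undetectable input.
from langdetect import detect

def _checkTitle(title):
    try:
        return detect(title)  # e.g. 'en' for an English title
    except Exception:
        return None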
Example #3
def _filterPub(papers):
    # Open a fresh connection per call so the function is safe to use with multiprocessing.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:
        id, type = papers

        threshold = 5

        # Occurrence counters per keyword group, split by paper section.
        k_dflanguage_head = 0
        k_dflanguage_tail = 0
        k_copy_head = 0
        k_copy_tail = 0
        k_detection_head = 0
        k_detection_tail = 0

        diff_language = ["Cross-language", "Crosslanguage",
                         "Cross-lingual", "Crosslingual",
                         "Cross-linguistic", "Crosslinguistic",
                         "Multi-language", "Multilanguage",
                         "Multi-lingual", "Multilingual",
                         "Multi-linguistic", "Multilinguistic",
                         "Machine-translation"]

        copy = ["Copy", "Duplicate", "Plagiarism"]

        detection = ["Detection", "Discovery"]

        # Lower-cased lookup sets; the table stores keywords in lower case.
        dflanguage_set = {k.lower() for k in diff_language}
        copy_set = {k.lower() for k in copy}
        detection_set = {k.lower() for k in detection}

        sql = ("select section, sstring, freq from resolved_papers_occurrenciesv4 "
               "where id = %s and type = 'paper'")
        cur.execute(sql, (id,))
        res = cur.fetchall()

        # Each row is (section, keyword, frequency); accumulate per group and section.
        for section, sstring, freq in res:
            if sstring in dflanguage_set:
                if section == "head":
                    k_dflanguage_head += freq
                if section == "tail":
                    k_dflanguage_tail += freq
            if sstring in copy_set:
                if section == "head":
                    k_copy_head += freq
                if section == "tail":
                    k_copy_tail += freq
            if sstring in detection_set:
                if section == "head":
                    k_detection_head += freq
                if section == "tail":
                    k_detection_tail += freq

        # An earlier version also required the detection group:
        # ... and (k_detection_head >= threshold and k_detection_tail >= threshold)
        if (k_dflanguage_head >= threshold and k_dflanguage_tail >= threshold) and \
                (k_copy_head >= threshold and k_copy_tail >= threshold):
            # Return the id instead of updating resolved_papers.toread directly.
            print(id)
            return id
    except:
        pass
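_filterPub returns the id only when both the language group and the copy group clear the threshold in both the head and the tail of the paper, and returns None otherwise. Collecting the selected ids from a pool run is then a filtering step; a sketch, assuming rows is the list of (id, type) tuples fed to the workers:

from multiprocessing import Pool

# Hypothetical collection step: drop the None results that _filterPub
# produces for papers below the threshold.
def collect_selected(rows):
    with Pool() as pool:
        results = pool.map(_filterPub, rows)
    return [paper_id for paper_id in results if paper_id is not None]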
Example #4
def _downloadIEEE():
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    # Select IEEE papers with an English title that have not been downloaded yet.
    sql = "SELECT p.id, p.main_link, p.direct_link FROM `resolved_papers` p inner join `resolved_papers_title` pt on pt.Id = p.Id where p.source like '%ieee%' and p.downloaded = 0 and pt.`title_language` = 'en';"
    papers = pd.read_sql(sql, con=db)

    for index, row in papers.iterrows():
        id = row['id']
        main_link = row['main_link']

        destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
        path = destination + str(id) + '.pdf'
        print(path)

        # The IEEE article number is the first run of digits in the main link.
        paper_id = re.findall(r'\d+', main_link)[0]
        try:
            # Fetch the PDF through IEEE Xplore's stampPDF endpoint with wget.
            url_pdf = 'wget "https://ieeexplore.ieee.org/stampPDF/getPDF.jsp?tp=&isnumber=&arnumber=%s" -O %s' % (
                paper_id, path)
            os.system(url_pdf)

            sql = "update resolved_papers set downloaded = 1 where id = %s"

            try:
                cur.execute(sql, (id,))
                db.commit()
                print("Id: %s. Downloaded: True. Saved!" % (id))
            except:
                db.rollback()
        except:
            print('Failed to fetch IEEE page with identifier %s due to request exception.' % (id))

        # Pause between requests to avoid hammering the server.
        time.sleep(randint(1, 6))
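os.system builds a shell command by string interpolation, so any unexpected character in the path ends up in the shell. A variant using subprocess.run with an argument list, shown here as a sketch rather than the original code, sidesteps the quoting entirely:

import subprocess

# Hypothetical safer variant of the wget call: an argument list means
# no shell parsing and no manual quoting of paper_id or path.
def fetch_ieee_pdf(paper_id, path):
    url = ('https://ieeexplore.ieee.org/stampPDF/getPDF.jsp'
           '?tp=&isnumber=&arnumber=%s' % paper_id)
    subprocess.run(['wget', url, '-O', path], check=False)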
Example #5
def _downloadSpringer(ids):
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:
        id, main_link, direct_link = ids
        # direct_link = 'http://link.springer.com/article/10.1007/s10579-014-9282-3'

        # SPRINGER: derive the PDF URL from the article or chapter landing page.
        if 'article' in main_link:
            # https://link.springer.com/article/10.1007/s10579-014-9282-3
            # https://link.springer.com/content/pdf/10.1007%2Fs10579-014-9282-3.pdf
            url_pdf = main_link.replace('article', 'content/pdf') + '.pdf'
        elif 'chapter' in main_link:
            # http://link.springer.com/chapter/10.1007/978-3-319-09846-3_4/fulltext.html
            # https://link.springer.com/content/pdf/10.1007%2F978-3-319-09846-3.pdf
            url_pdf = main_link.replace('chapter', 'content/pdf') + '.pdf'
        else:
            # Neither an article nor a chapter link: nothing to download.
            return

        ua = str(get_random_ua())

        try:
            response = requests.get(url_pdf, headers={'User-Agent': ua})
        except:
            # No response object to work with; give up on this id.
            print("Connection refused")
            time.sleep(5)
            return

        print(response.status_code)
        if response.status_code == 200:

            content_type = response.headers.get('content-type')

            if 'application/pdf' in str(content_type):
                destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
                path = destination + str(id) + '.pdf'

                with open(path, 'wb') as f:
                    f.write(response.content)

                sql = "update resolved_papers set downloaded = 1 where id = %s"

                try:
                    cur.execute(sql, (id,))
                    db.commit()
                    print("Id: %s. Downloaded: True. Saved!" % (id))
                except:
                    db.rollback()

            else:
                print('Title with identifier %s not found' % (id))
    except:
        print('Failed to fetch Springer page with identifier %s due to request exception.' % (id))

    time.sleep(randint(1, 6))
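The link rewriting is the core of this function: Springer serves the PDF under /content/pdf/ with the same DOI path as the landing page. Stepping through it with the sample link from the comments above:

# Illustration of the URL rewrite, using the link from the comments:
main_link = 'https://link.springer.com/article/10.1007/s10579-014-9282-3'
url_pdf = main_link.replace('article', 'content/pdf') + '.pdf'
# -> 'https://link.springer.com/content/pdf/10.1007/s10579-014-9282-3.pdf'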
Example #6
def _download(ids):
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:
        id, query = ids

        # Search CiteSeerX for the paper title.
        params = urlencode({'q': query.lower()}, encoding="UTF-8")
        url = SCHOLARS_BASE_URL + "/search?" + params

        print(url)

        ua = str(get_random_ua())

        try:
            response = requests.get(url, headers={'User-Agent': ua})
        except:
            # No response object to work with; give up on this id.
            print("Connection refused")
            time.sleep(5)
            return

        print(response.status_code)
        if response.status_code == 200:

            data = response.text
            soup = BeautifulSoup(data, "html.parser")

            # Take the first search result and strip the HTML tags from its title.
            item = soup.find_all('div', {'class': 'result'})[0]

            title = ""
            if item:
                link = str(item.contents[1]).split('\n')
                title = re.sub('<[^<]+?>', '', link[2])

            if query.lower() == title.lower():
                # Rewrite the summary href into a direct download link, e.g.
                # '/viewdoc/summary;jsessionid=...?doi=10.1.1.317.9673&rank=1'
                # -> '/viewdoc/download?doi=10.1.1.317.9673&rep=rep1&type=pdf'
                soup = BeautifulSoup(link[1], "html.parser")
                a = soup.find("a", class_="remove doc_details")
                string = a.attrs['href']

                suffix = re.sub(r';.*\?', '?', string)
                suffix = suffix.replace('summary', 'download').replace(
                    '&rank=1', '&rep=rep1&type=pdf')

                url_pdf = SCHOLARS_BASE_URL + suffix
                print(url_pdf)

                res = requests.get(url_pdf)
                content_type = res.headers.get('content-type')

                if 'application/pdf' in str(content_type):
                    destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
                    path = destination + str(id) + '.pdf'

                    with open(path, 'wb') as f:
                        f.write(res.content)

                sql = "update resolved_papers set downloaded = 1 where id = %s"

                try:
                    cur.execute(sql, (id,))
                    db.commit()
                    print("Id: %s. Downloaded: True. Saved!" % (id))
                except:
                    db.rollback()

            else:
                print('Title not found with identifier %s' % (id))
    except:
        print('Failed to fetch CiteSeerX page with identifier %s due to request exception.' % (id))

    time.sleep(randint(1, 6))
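The suffix rewrite turns a CiteSeerX summary link into a direct download link. Stepping through it with the sample href preserved in the comments:

# Illustration of the suffix rewrite:
string = '/viewdoc/summary;jsessionid=4C1CD7E8F0D4A4E4BABAE601DE8D326F?doi=10.1.1.317.9673&rank=1'
suffix = re.sub(r';.*\?', '?', string)  # drop the jsessionid segment
suffix = suffix.replace('summary', 'download').replace('&rank=1', '&rep=rep1&type=pdf')
# -> '/viewdoc/download?doi=10.1.1.317.9673&rep=rep1&type=pdf'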
Example #7
def downloadPDFIEEE(ids):
    # Open a fresh connection per call so the function is safe to use with multiprocessing.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:
        i, res_title, main_link, direct_link = ids

        p = False
        downloaded = False
        count = 0

        # res_title is the detected title language (see resolved_papers_title);
        # only English papers are fetched.
        if res_title == "en":

            print(i)
            print(main_link)
            print(direct_link)

            url = direct_link

            # First attempt: the direct link via the DownloadPDF helper.
            # Second attempt: a plain requests fetch of the IEEE stamp page
            # rebuilt from the article number.
            while not downloaded and count < 2:

                count += 1
                if count == 2:
                    file = requests.get(url)
                    with open('/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/%s.pdf' % (i), 'wb') as f:
                        f.write(file.content)
                    p = True
                else:
                    if url:
                        s = DownloadPDF()
                        p = s.download(url,
                                       destination='/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/',
                                       path=str(i) + '.pdf')
                if p:
                    downloaded = True
                else:
                    # The IEEE article number is the first run of digits in the main link.
                    paper_id = re.findall(r'\d+', main_link)[0]
                    url = 'https://ieeexplore.ieee.org/stamp/stamp.jsp?tp=&arnumber=%s' % (paper_id)

            if downloaded:
                sql = "update resolved_papers set downloaded = 1 where id = %s"

                try:
                    cur.execute(sql, (i,))
                    db.commit()
                    print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
                except:
                    db.rollback()
            else:
                print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError:
        pass

    cur.close()
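The while loop is a two-attempt fallback: first the direct link through the DownloadPDF helper, then a plain fetch of the stamp URL rebuilt from the article number. The same pattern, factored into a generic helper as a sketch (candidate_urls and fetch are assumptions, not project names):

# Hypothetical generic form of the fallback loop: try each candidate
# URL in order and stop at the first fetch that succeeds.
def download_with_fallback(candidate_urls, fetch):
    for url in candidate_urls:
        if url and fetch(url):
            return True
    return False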
Example #8
def downloadPDF(ids):
    # Open a fresh connection per call so the function is safe to use with multiprocessing.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:
        i, res_title, main_link, direct_link = ids

        p = False
        downloaded = False
        count = 0

        # res_title is the detected title language (see resolved_papers_title);
        # only English papers are fetched.
        if res_title == "en":

            print(i)
            print(main_link)
            print(direct_link)

            # Try the direct link first; fall back to the main link if it is empty.
            url = direct_link

            while not downloaded and count < 2:
                count += 1
                if url:
                    s = DownloadPDF()
                    p = s.download(url,
                                   destination='/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/',
                                   path=str(i) + '.pdf')
                if p:
                    downloaded = True
                else:
                    url = main_link

            if downloaded:
                sql = "update resolved_papers set downloaded = 1 where id = %s"

                try:
                    cur.execute(sql, (i,))
                    db.commit()
                    print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
                except:
                    db.rollback()
            else:
                print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError:
        pass

    cur.close()
Example #9
def _filterTitle(papers):
    # Open a fresh connection per call so the function is safe to use with multiprocessing.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:

        id, title = papers

        threshold = 1

        # title = 'A New Approach for Cross-Language Plagiarism Analysis.'.lower()
        title = title.lower()

        k_dflanguage = 0
        k_copy = 0
        k_detection = 0

        diff_language = ["Cross-language", "Crosslanguage",
                         "Cross-lingual", "Crosslingual",
                         "Cross-linguistic", "Crosslinguistic",
                         "Multi-language", "Multilanguage",
                         "Multi-lingual", "Multilingual",
                         "Multi-linguistic", "Multilinguistic",
                         "Machine-translation"]

        copy = ["Copy", "Duplicate", "Plagiarism"]

        detection = ["Detection", "Discovery"]

        # Count how many keywords of each group appear in the title (substring match).
        for row in diff_language:
            if row.lower() in title:
                k_dflanguage += 1

        for row in copy:
            if row.lower() in title:
                k_copy += 1

        for row in detection:
            if row.lower() in title:
                k_detection += 1

        print("diff_language: %s." % (k_dflanguage))
        print("copy: %s." % (k_copy))
        print("detection: %s." % (k_detection))

        # Select the paper if any keyword group reaches the threshold.
        if k_dflanguage >= threshold or k_copy >= threshold or k_detection >= threshold:
            sql = "insert into resolved_papers_selected_title values (%s)"
            print(sql)
            try:
                cur.execute(sql, (id,))
                db.commit()
            except:
                db.rollback()

            return True
        else:
            return False
    except:
        db.rollback()
        print('not saved')
    cur.close()
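Note that in tests plain substrings, so "copy" also fires on a title containing "copyright". A stricter matcher using word boundaries, shown as a sketch because it deliberately changes the original behaviour:

import re

# Hypothetical stricter matcher: \b word boundaries stop "copy" from
# matching inside "copyright" (unlike the substring test above).
def count_keyword_hits(keywords, title):
    title = title.lower()
    return sum(bool(re.search(r'\b' + re.escape(k.lower()) + r'\b', title))
               for k in keywords)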
Example #10

def downloadPDF(ids):
    # Open a fresh connection per call so the function is safe to use with multiprocessing.
    db = pymysql.connect(host="dbinstancephd.cikbkbxucwjr.us-east-2.rds.amazonaws.com",  # your host, usually localhost
                         user="******",  # your username
                         passwd="iwJx0EAM",  # your password
                         db="clpd")  # name of the database

    cur = db.cursor()

    try:
        i, main_link, direct_link = ids

        p = False
        downloaded = False
        count = 0

        print(i)
        print(main_link)
        print(direct_link)

        # Try the direct link first; fall back to the main link if it is empty.
        url = direct_link

        while not downloaded and count < 2:
            count += 1
            if url:
                destination = '/Volumes/SeagateBackupPlusDrive/CLPD2019_FULL/pdf/tocheck/'
                path = destination + str(i) + '.pdf'

                try:
                    # Some hosts (e.g. academia.edu) redirect the download link
                    # to a signed S3 URL; resolve it with a browser User-Agent.
                    ua = UserAgent()
                    headers = {'User-Agent': str(ua.random)}

                    s = requests.session()
                    res = s.get(url, headers=headers, allow_redirects=False)
                    print(res.url)

                    # urlretrieve returns (filename, headers); accept the file
                    # only if the server actually sent a PDF.
                    p = urlretrieve(url, path)
                    if p[1].get_content_type() == 'application/pdf':
                        downloaded = True
                except:
                    pass
            else:
                url = main_link

        if downloaded:
            sql = "update resolved_papers set downloaded = 1 where id = %s"

            try:
                cur.execute(sql, (i,))
                db.commit()
                print("Id: %s. Downloaded: %s. Saved!" % (i, downloaded))
            except:
                db.rollback()
        else:
            print("Id: %s. Downloaded: %s." % (i, downloaded))

    except UnicodeDecodeError:
        pass

    cur.close()
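urlretrieve returns a (filename, headers) pair, which is why the content type can only be checked after the file has already been written to disk. A requests-based equivalent, sketched after the pattern Example #5 uses, avoids writing non-PDF responses at all:

# Hypothetical requests-based equivalent of urlretrieve + content-type check:
# download into memory first, write to disk only if the server sent a PDF.
def save_if_pdf(url, path):
    res = requests.get(url, allow_redirects=True)
    if 'application/pdf' in str(res.headers.get('content-type')):
        with open(path, 'wb') as f:
            f.write(res.content)
        return True
    return False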