Пример #1
0
    def __init__(self,
                 url='',
                 username='',
                 userTitle='',
                 id='',
                 name='',
                 avatar='',
                 link='',
                 score=0.0,
                 feedbackOverall=0,
                 feedbackSummaryList=None,
                 feedbackRatings=None,
                 reviews=None,
                 lastUpdate=0):
        """Store the given field values as attributes of the new instance.

        Every argument is kept under an attribute of the same name.  ``url``
        is passed through ``LinksHelper.fix_url`` before it is stored.

        ``feedbackSummaryList``, ``feedbackRatings`` and ``reviews`` default
        to ``None``, which is replaced by a fresh empty list for each
        instance.  A literal ``[]`` default would be created once and shared
        by every instance built without these arguments, so appending to one
        instance's list would silently change all the others.
        """
        self.url = LinksHelper.fix_url(url)
        self.username = username

        self.userTitle = userTitle
        self.id = id
        self.name = name
        self.avatar = avatar
        self.link = link

        self.score = score

        self.feedbackOverall = feedbackOverall
        # A list supplied by the caller is stored as-is (not copied).
        self.feedbackSummaryList = (
            [] if feedbackSummaryList is None else feedbackSummaryList)
        self.feedbackRatings = [] if feedbackRatings is None else feedbackRatings

        self.reviews = [] if reviews is None else reviews
        self.lastUpdate = lastUpdate
    def __init__(self, url, type, originalId, id, author, categories, title,  description, images,
                 timeLeft, price, details, date, ratings, shipping, reviews, lastUpdate ):
        """Copy every constructor argument onto the instance.

        Each argument is stored under an attribute of the same name; the only
        transformation is that ``url`` goes through ``LinksHelper.fix_url``.
        """
        # Identity
        self.url = LinksHelper.fix_url(url)
        self.type, self.originalId, self.id = type, originalId, id

        # Ownership and classification
        self.author, self.categories = author, categories

        # Content
        self.title = title
        self.description = description
        self.images = images
        self.details = details

        # Offer data
        self.price = price
        self.timeLeft = timeLeft
        self.shipping = shipping

        # Feedback
        self.ratings = ratings
        self.reviews = reviews

        # Timestamps
        self.date = date
        self.lastUpdate = lastUpdate
Пример #3
0
    def __init__(self,
                 url,
                 authorUsername,
                 authorFullName,
                 date,
                 score,
                 title,
                 body,
                 purchased,
                 thumbsUp,
                 thumbsDown,
                 lastUpdate=''):
        """Copy every constructor argument onto the instance.

        Each argument is stored under an attribute of the same name; ``url``
        is first passed through ``LinksHelper.fix_url``.
        """
        self.url = LinksHelper.fix_url(url)

        # Who wrote it and when
        self.authorUsername, self.authorFullName = authorUsername, authorFullName
        self.date = date
        self.lastUpdate = lastUpdate

        # What was written
        self.title, self.body = title, body
        self.score = score
        self.purchased = purchased

        # Vote counters
        self.thumbsUp, self.thumbsDown = thumbsUp, thumbsDown
Пример #4
0
    def findLJSONObjectAlready(website,
                               url='',
                               title='',
                               description='',
                               allowTitleIncluded=False):
        """Return the first cached object of ``website`` that matches, or None.

        The objects are taken from the module-level ``arrJSONObjects`` cache.
        When ``website`` is not cached yet, ``JSONDB.readJSONObjectsFiles`` is
        called first; if it returns ``False`` a message is printed and ``None``
        is returned.

        An object matches when any of these holds:

        * it has a ``url`` attribute equal to the fixed ``url``;
        * ``title`` is non-empty and equals its ``title`` attribute;
        * ``description`` is non-empty and equals its ``description``;
        * ``allowTitleIncluded`` is true and one title contains the other.

        Args:
            website: key of the cache entry to search.
            url: URL to look for; normalised with ``LinksHelper.fix_url``.
            title: title to look for (ignored when empty).
            description: description to look for (ignored when empty).
            allowTitleIncluded: also accept substring matches between titles.
        """
        url = LinksHelper.fix_url(url)

        global arrJSONObjects

        if website not in arrJSONObjects:
            if JSONDB.readJSONObjectsFiles(website) == False:
                print("IT DOESNT WORK....")
                return None

        candidates = arrJSONObjects[website]
        if candidates is None:
            return None

        for candidate in candidates:
            # The former unconditional debug print of ``candidate.title`` was
            # dropped: it raised AttributeError for objects without a title,
            # defeating the hasattr() guards below.
            has_title = hasattr(candidate, 'title')

            if (hasattr(candidate, 'url') and url == candidate.url) or \
               (title != '' and has_title and title == candidate.title) or \
               (description != '' and hasattr(candidate, 'description')
                    and description == candidate.description):
                return candidate

            # Substring matching needs two non-empty titles: an empty string
            # is contained in every string, so without this guard the first
            # object would always be returned.
            if allowTitleIncluded and title != '' and has_title:
                candidate_title = candidate.title
                if candidate_title and (title in candidate_title
                                        or candidate_title in title):
                    return candidate

        return None
    def __init__(self, url, type, id, title, parent):
        """Copy the constructor arguments onto the instance.

        ``url`` is passed through ``LinksHelper.fix_url``; the other arguments
        are stored unchanged under attributes of the same name.
        """
        self.url = LinksHelper.fix_url(url)
        self.type, self.id = type, id
        self.title, self.parent = title, parent
Пример #6
0
    def parse(self, response):
        """Process one response, queue the links it contains and yield requests.

        This is a generator: nothing below runs until the result is iterated.

        When ``self.testingURL`` is non-empty the incoming response is
        replaced by a fresh download of that URL wrapped in a ``Selector``.
        The (possibly replaced) response is handed to ``self.parseResponse``.
        Unless ``self.ONLY_ONE_PAGE`` compares equal to ``False`` the method
        stops there; otherwise every ``<a>`` href is queued via
        ``self.addLink`` and one ``scrapy.Request`` is yielded for each queue
        entry that has not been requested yet.
        """
        page_url = response.url

        if self.testingURL != '':
            fetched = LinksHelper.getRequestTrials(self.session,
                                                   self.testingURL, {}, {},
                                                   maxTrials=5)
            response = Selector(text=fetched.text)
            page_url = self.testingURL

        self.parseResponse(response, page_url)

        print(page_url, "data", response, self.ONLY_ONE_PAGE)

        # Explicit comparison kept on purpose: a value such as None does not
        # compare equal to False and therefore also stops here.
        if self.ONLY_ONE_PAGE != False:
            return

        for anchor in response.css('a'):
            href = self.extractFirstElement(anchor.xpath('@href'))

            # Drop the fragment ('#...') part of the link.
            href = href.partition('#')[0]

            if not urlparse(href).scheme:
                # Scheme-less link: prefix it with self.url (trailing slashes
                # removed).  NOTE(review): plain concatenation, so an href
                # without a leading '/' is glued straight onto self.url.
                href = self.url.rstrip('/') + href

            # MAXIMUM_NUMBER_PAGES == 0 means "no limit on the queue size".
            unlimited = self.MAXIMUM_NUMBER_PAGES == 0
            if unlimited or len(self.linksQueue) < self.MAXIMUM_NUMBER_PAGES:
                self.addLink(href)

        try:
            while self.linksQueueIndex < len(self.linksQueue):
                # Advance the cursor before yielding so a re-entrant call to
                # parse() continues with the following entry.
                position = self.linksQueueIndex
                self.linksQueueIndex = position + 1

                yield scrapy.Request(url=self.linksQueue[position],
                                     callback=self.parse)
        except ValueError:
            # Presumably raised by scrapy.Request for an unusable URL;
            # draining the queue stops silently in that case. TODO confirm.
            pass
Пример #7
0
    def getAddress(city, country):
        """Geocode ``"<city> <country>"`` and return the first location found.

        The query is sent to the Google Maps geocoding JSON endpoint through
        ``LinksHelper.getRequestTrials`` (up to 5 trials).

        Args:
            city: city name.
            country: country name.

        Returns:
            The ``['geometry']['location']`` value of the first entry in the
            response's ``results`` list, or ``None`` when the response holds
            no results.
        """
        from urllib.parse import quote_plus

        # quote_plus still turns spaces into '+', and additionally escapes
        # characters such as '&', '#' or '+' that would otherwise corrupt the
        # query string.
        address = quote_plus(city + ' ' + country)
        request_url = ('https://maps.google.com/maps/api/geocode/json?address='
                       + address + '&sensor=false')

        # The context manager closes the session (and its pooled connections)
        # once the response body has been decoded.
        with requests.Session() as session:
            response = LinksHelper.getRequestTrials(session, request_url, {}, {}, maxTrials=5)
            payload = response.json()

        # A payload without a 'results' key is treated like an empty result.
        matches = payload.get('results') or []
        if len(matches) > 0:
            return matches[0]['geometry']['location']
        return None
Пример #8
0
    def postAddForum(rootURL, url, user, parentId, name, title, description,
                     iconPic, coverPic, arrKeywords = [], dtOriginalDate = None,
                     country='', city='', language='',
                     latitude=-666, longitude=-666):
        """Post a new forum to the server's ``forums/add-forum`` endpoint.

        The user is logged in through ``ServerAPI.loginUser``; the text fields
        are cleaned with ``LinksHelper.fixArchiveStrings`` and the coordinates
        are taken from ``ServerAPI.processLocation``.

        Returns:
            ``False`` when the login yields no user, the new forum's id when
            the server answers with ``result == True``, otherwise ``None``.
        """
        user = ServerAPI.loginUser(user)
        if user is None:
            return False

        title = LinksHelper.fixArchiveStrings(title)
        description = LinksHelper.fixArchiveStrings(description)
        iconPic = LinksHelper.fixArchiveStrings(iconPic)
        coverPic = LinksHelper.fixArchiveStrings(coverPic)

        description = LinksHelper.fix_relative_urls(description, rootURL)

        location = ServerAPI.processLocation(country, city, language, latitude, longitude)
        latitude, longitude = location[0], location[1]

        # Provenance of the scraped entry, sent as a JSON string.
        arrAdditionalInfo = {
            'scraped': True,
            'source': {'page': url, 'website': rootURL},
        }
        if dtOriginalDate is not None:
            arrAdditionalInfo['dtOriginal'] = dtOriginalDate

        # Keywords may arrive as a ready-made string or as an iterable.
        keywords = (arrKeywords if isinstance(arrKeywords, str)
                    else ','.join(str(keyword) for keyword in arrKeywords))

        data = {
            'id': user['id'],
            'sessionId': user['sessionId'],
            'parent': parentId,
            'title': title,
            'name': name,
            'description': description,
            'iconPic': iconPic,
            'coverPic': coverPic,
            'keywords': keywords,
            'country': country,
            'city': city,
            'language': language,
            'latitude': latitude,
            'longitude': longitude,
            'additionalInfo': ujson.dumps(arrAdditionalInfo),
        }

        response = LinksHelper.getRequestTrials(session, SERVER_URL + "forums/add-forum", data, {}, maxTrials = 5)
        answer = response.json()

        if answer['result'] == True:
            print('FORUM new ', answer['forum']['URL'])
            return answer['forum']['id']

        print("ERROR adding new forum ", answer)
        return None
Пример #9
0
    def postAddReply(rootURL, user, parentId, parentReplyId, title, description,
                     arrKeywords = [], arrAttachments=[], dtOriginalDate = None,
                     country='', city='', language='',
                     latitude=-666, longitude=-666,
                     authorName='', authorAvatar='' ):
        """Post a new reply to the server's ``replies/add-reply`` endpoint.

        The user is logged in through ``ServerAPI.loginUser``; text fields are
        cleaned with ``LinksHelper.fixArchiveStrings`` and the coordinates are
        taken from ``ServerAPI.processLocation``.  A ``parentReplyId`` of
        ``None`` is sent as an empty string.

        Returns:
            ``False`` when the login yields no user or ``parentId`` is
            ``None``, the new reply's id when the server answers with
            ``result == True``, otherwise ``None``.
        """
        user = ServerAPI.loginUser(user)
        if user is None:
            return False
        if parentId is None:
            return False
        if parentReplyId is None:
            parentReplyId = ""

        title = LinksHelper.fixArchiveStrings(title)
        description = LinksHelper.fixArchiveStrings(description)
        authorAvatar = LinksHelper.fixArchiveStrings(authorAvatar)
        authorName = LinksHelper.fixArchiveStrings(authorName)

        description = LinksHelper.fix_relative_urls(description, rootURL)

        location = ServerAPI.processLocation(country, city, language, latitude, longitude)
        latitude, longitude = location[0], location[1]

        # Optional metadata is only included when a value is available.
        arrAdditionalInfo = {'scraped': True}
        if dtOriginalDate is not None:
            arrAdditionalInfo['dtOriginal'] = dtOriginalDate
        if authorName != '':
            arrAdditionalInfo['orgName'] = authorName
        if authorAvatar != '':
            arrAdditionalInfo['orgAvatar'] = authorAvatar

        # Keywords may arrive as a ready-made string or as an iterable.
        keywords = (arrKeywords if isinstance(arrKeywords, str)
                    else ','.join(str(keyword) for keyword in arrKeywords))

        data = {
            'id': user['id'],
            'sessionId': user['sessionId'],
            'parent': parentId,
            'parentReply': parentReplyId,
            'title': title,
            'description': description,
            'keywords': keywords,
            'attachments': ujson.dumps(arrAttachments),
            'country': country,
            'city': city,
            'language': language,
            'latitude': latitude,
            'longitude': longitude,
            'additionalInfo': ujson.dumps(arrAdditionalInfo),
        }

        response = LinksHelper.getRequestTrials(session, SERVER_URL + "replies/add-reply", data, {}, maxTrials = 5)
        answer = response.json()

        if answer['result'] == True:
            print('reply new ', answer['reply']['URL'])
            return answer['reply']['id']

        print("ERROR adding new reply ", answer)
        return None
Пример #10
0
    def addLinkVisited(website, url):
        """Record ``url`` as visited for ``website`` in ``arrLinksVisited``.

        The URL is normalised with ``LinksHelper.fix_url`` first.

        Returns:
            ``False`` when ``LinksDB.checkLinkVisitedAlready`` reports the
            link as already visited; otherwise the link is appended to the
            website's list and ``None`` is returned implicitly.
        """
        global arrLinksVisited

        normalized = LinksHelper.fix_url(url)

        already_seen = LinksDB.checkLinkVisitedAlready(website, normalized)
        if already_seen == True:
            return False

        # Start an empty list only when the website is not cached and loading
        # its stored links reports failure (an explicit False).
        if website not in arrLinksVisited and \
                LinksDB.readLinksVisitedFiles(website) == False:
            arrLinksVisited[website] = []

        arrLinksVisited[website].append(normalized)
Пример #11
0
    def checkLinkVisitedAlready(website, url):
        """Tell whether ``url`` is already in the visited list of ``website``.

        The URL is normalised with ``LinksHelper.fix_url``.  When the website
        is not in the ``arrLinksVisited`` cache, ``LinksDB.readLinksVisitedFiles``
        is called; if that returns ``False`` the answer is ``False``.

        Returns:
            ``True`` when the normalised URL is in the website's list,
            ``False`` otherwise (including when the cached list is ``None``).
        """
        global arrLinksVisited

        wanted = LinksHelper.fix_url(url)

        if website not in arrLinksVisited and \
                LinksDB.readLinksVisitedFiles(website) == False:
            return False

        visited = arrLinksVisited[website]
        return visited is not None and wanted in visited
Пример #12
0
    def processURL(self, initialURL, timestamp, endtimestamp):
        """Download an archived copy of ``initialURL`` and feed it to the crawler.

        Nothing is fetched (and ``None`` is returned) when ``initialURL``
        contains any entry of ``self.crawler.rejectionSubstr``.  Otherwise the
        page is requested from web.archive.org at ``endtimestamp``, wrapped in
        a ``Selector`` and passed to ``self.crawler.parseResponse`` together
        with the original URL.  Before that, ``self.crawler.date`` is set to
        ``timestamp`` with date and time separators inserted.
        """
        # Example of an archived address:
        # https://web.archive.org/web/20130502222444/http://hackpedia.info/viewtopic.php?f=43&t=16653&p=116862&sid=2a60dce4bac29bf5b399c5741f4e5cb3
        if any(rejection in initialURL
               for rejection in self.crawler.rejectionSubstr):
            return None

        archive_url = "http://web.archive.org/web/" + endtimestamp + "/" + initialURL

        response = LinksHelper.getRequestTrials(self.session,
                                                archive_url,
                                                {},
                                                {},
                                                maxTrials=5)
        selector = Selector(text=response.text)

        # Insert separators after the 4th, 6th, 8th, 10th and 12th character,
        # e.g. '20130502222444' -> '2013-05-02 22:24:44'.
        stamp = timestamp
        formatted = (stamp[:4] + '-' + stamp[4:6] + '-' + stamp[6:8] + ' '
                     + stamp[8:10] + ':' + stamp[10:12] + ':' + stamp[12:])

        self.crawler.date = formatted
        self.crawler.parseResponse(selector, initialURL)
Пример #13
0
    def findLinkObjectAlready(website, url='', title='', description='', allowTitleIncluded=False, similarity=False):
        """Return the first cached object of ``website`` that matches, or None.

        The objects are taken from the module-level ``arrLinksObjects`` cache.
        When ``website`` is not cached yet, ``LinksDB.readLinkObjectsFiles`` is
        called first; if it returns ``False`` a message is printed and ``None``
        is returned.

        An object matches when any of these holds (checked in this order):

        * it has a ``url`` attribute equal to the fixed ``url``, or a
          non-empty ``title`` / ``description`` equals its attribute;
        * ``similarity`` is true and the titles or the descriptions have a
          ``SequenceMatcher`` ratio of at least 0.7;
        * ``allowTitleIncluded`` is true and one title contains the other.

        Args:
            website: key of the cache entry to search.
            url: URL to look for; normalised with ``LinksHelper.fix_url``.
            title: title to look for (ignored when empty).
            description: description to look for (ignored when empty).
            allowTitleIncluded: also accept substring matches between titles.
            similarity: also accept fuzzy title/description matches.
        """
        url = LinksHelper.fix_url(url)

        global arrLinksObjects

        if website not in arrLinksObjects:
            if LinksDB.readLinkObjectsFiles(website) == False:
                print("IT DOESNT WORK....")
                return None

        candidates = arrLinksObjects[website]
        if candidates is None:
            return None

        for candidate in candidates:
            has_title = hasattr(candidate, 'title')
            has_description = hasattr(candidate, 'description')

            if (hasattr(candidate, 'url') and url == candidate.url) or \
               (title != '' and has_title and title == candidate.title) or \
               (description != '' and has_description
                    and description == candidate.description):
                return candidate

            if similarity:
                if title != '' and has_title:
                    if SequenceMatcher(None, title, candidate.title).ratio() >= 0.7:
                        return candidate

                if description != '' and has_description:
                    if SequenceMatcher(None, description, candidate.description).ratio() >= 0.7:
                        return candidate

            # Substring matching needs two non-empty titles: an empty string
            # is contained in every string, so without this guard the first
            # object would always be returned.  The hasattr() check also
            # avoids an AttributeError on objects that carry no title.
            if allowTitleIncluded and title != '' and has_title:
                candidate_title = candidate.title
                if candidate_title and (title in candidate_title
                                        or candidate_title in title):
                    return candidate

        return None
Пример #14
0
    def postAddProduct(rootURL, url, user, parentId, title, description, shortDescription='', arrKeywords=None, arrAttachments=None,
                       dtOriginalDate=None, country='', city='', language='', latitude=-666, longitude=-666,
                       itemId='', author=None, timeLeft=0, details=None, price=None, ratingScoresList=None, shipping=None, reviewsList=None, lastUpdate=''):
        """Post a product to the server's ``topics/add-topic`` endpoint.

        The user is logged in through ``ServerAPI.loginUser``; text fields are
        cleaned with ``LinksHelper.fixArchiveStrings`` and the coordinates are
        taken from ``ServerAPI.processLocation``.

        ``author``, ``details``, ``price``, ``ratingScoresList``, ``shipping``
        and ``reviewsList`` are optional objects exposing ``getJSON()``.  They
        default to ``None``; a missing one is sent as JSON ``null`` instead of
        raising ``AttributeError``.  ``arrKeywords`` and ``arrAttachments``
        default to ``None``, which is treated as an empty list.

        Returns:
            ``False`` when the login yields no user, the new topic's id when
            the server answers with ``result == True``, otherwise ``None``.
        """

        def dump_part(part):
            # Serialise an optional getJSON()-providing object.
            return ujson.dumps(None if part is None else part.getJSON())

        user = ServerAPI.loginUser(user)

        if user is None: return False

        if arrKeywords is None: arrKeywords = []
        if arrAttachments is None: arrAttachments = []

        title = LinksHelper.fixArchiveStrings(title)
        description = LinksHelper.fixArchiveStrings(description)
        shortDescription = LinksHelper.fixArchiveStrings(shortDescription)

        # Without an author there is no avatar/name to report; the empty
        # strings make the checks below skip those entries.
        if author is None:
            authorAvatar = ''
            authorName = ''
        else:
            authorAvatar = LinksHelper.fixArchiveStrings(author.avatar)
            authorName = LinksHelper.fixArchiveStrings(author.username)

        description = LinksHelper.fix_relative_urls(description, rootURL)

        rez = ServerAPI.processLocation(country, city, language, latitude, longitude)
        latitude = rez[0]
        longitude = rez[1]

        arrAdditionalInfo = {
            'scraped': True,
            'source': {
                'page': url,
                'website': rootURL,
            },

            'itemId': itemId,
            'timeLeft': timeLeft,
        }

        if dtOriginalDate is not None: arrAdditionalInfo['dtOriginal'] = dtOriginalDate
        if authorName != '': arrAdditionalInfo['orgName'] = authorName
        if authorAvatar != '': arrAdditionalInfo['orgAvatar'] = authorAvatar

        # Keywords may arrive as a ready-made string or as an iterable.
        if isinstance(arrKeywords, str):
            keywords = arrKeywords
        else:
            keywords = ','.join(str(e) for e in arrKeywords)

        data = {
            'id': user['id'],
            'sessionId': user['sessionId'],

            'parent': parentId,
            'title': title,
            'description': description,
            'shortDescription': shortDescription,
            'keywords': keywords,
            'attachments': ujson.dumps(arrAttachments),
            'country': country,
            'city': city,
            'language': language,
            'latitude': latitude,
            'longitude': longitude,
            'additionalInfo': ujson.dumps(arrAdditionalInfo),

            # additional product information

            'author': dump_part(author),
            'details': dump_part(details),
            'price': dump_part(price),
            'ratingScoresList': dump_part(ratingScoresList),
            'shipping': dump_part(shipping),
            'reviewsList': dump_part(reviewsList),
            'lastUpdate': lastUpdate
        }

        headers = {}

        result = LinksHelper.getRequestTrials(session, SERVER_URL + "topics/add-topic", data, headers, maxTrials = 5)
        result = result.json()

        if result['result'] == True:

            print('topic new ', result['topic']['URL'])
            return result['topic']['id']
        else:
            print("ERROR adding new topic ", result)
            return None