Example No. 1
def check_headline(headline):
   headline = HTMLParser().unescape(headline)
   headline = headline.lower()
   tokens = nltk.word_tokenize(headline)
   tagged = NLPParser.t3.tag(tokens)
   entities = nltk.ne_chunk(tagged, binary=True)
   parsed = NLPParser.cp.parse(entities)
   for node in parsed:
     if type(node) is nltk.Tree and node.label() == 'SBAR':
       sentence = node.leaves()
       sentence = ' '.join([t[0] for t in sentence])
       return sentence, 'sentence'
   tag_sequence = [t[1] for t in tagged]
   for node in parsed:
     if type(node) is nltk.Tree and node.label() == 'NE':
       subject = node.leaves()
       subject = ' '.join([t[0] for t in subject])
       return subject, 'topic'
   np_len = 0
   np_topic = []
   for node in parsed:
     if type(node) is nltk.Tree and node.label() == 'NP':
       np = node.leaves()
       if len(np) >= np_len and 'PP' not in np:
         np_len = len(np)
         np_topic = np
   if np_topic:
     np_topic = ' '.join([t[0] for t in np_topic])
     return np_topic, 'topic'
   else:
     for t in tag_sequence:
       if re.match(r'VB.*', t):
         return NLPParser.simple_headline(headline), 'sentence'
     return NLPParser.simple_headline(headline), 'topic'
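The function above depends on a project-specific NLPParser (a trained tagger t3, a chunker cp and a simple_headline helper) that the example does not show. Below is a rough, self-contained sketch of the same tree-walking idea using only stock nltk components; it is a stand-in, not the original pipeline, and it assumes the usual nltk data packages (punkt, averaged_perceptron_tagger, maxent_ne_chunker, words) have been downloaded.

import nltk

headline = 'Apple unveils a new phone in California'
tokens = nltk.word_tokenize(headline)
tagged = nltk.pos_tag(tokens)                  # stand-in for NLPParser.t3.tag
entities = nltk.ne_chunk(tagged, binary=True)  # named entities become 'NE' subtrees

# Walk the chunk tree the same way check_headline does and print any named entity.
for node in entities:
    if type(node) is nltk.Tree and node.label() == 'NE':
        print ' '.join(t[0] for t in node.leaves())  # e.g. 'Apple'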
Example No. 2
def uncode_name(name):  # convert all the &# codes to char, remove extra-space and normalize
    from HTMLParser import HTMLParser
    name = name.replace('<![CDATA[', '').replace(']]', '')
    name = HTMLParser().unescape(name.lower())
    return name
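A quick hypothetical call, assuming the function above is in scope (Python 2): the CDATA markers are stripped, the numeric entity is decoded and the result is lower-cased.

name = uncode_name('<![CDATA[Caf&#233; del MAR]]')
print repr(name)  # -> u'caf\xe9 del mar'

On Python 3 the unescape method was deprecated and eventually removed from HTMLParser; the stdlib html.unescape() function provides the same entity decoding.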
Example No. 3
def cleanUpMatch(match):
    # remove everything between double line breaks
    match = re.sub(r'\n\s*\n.*?\n\s*\n', '', match, flags=re.DOTALL)

    # remove soft hyphen
    match = re.sub(r'&#xAD;(\n)*(<lb/>)*', '', match)

    # remove box character
    match = re.sub(r'\xc2\xad(\n)*(<lb/>)*', '', match)

    # replace <lb/> or '\n' with ' '
    match = re.sub(r'<lb/>|\n', ' ', match)

    # remove everything between '<>'
    match = re.sub(r'<.*?>', '', match)

    # translate special characters
    for char in TRANSLATIONS:
        match = match.replace(char, TRANSLATIONS[char])

    # remove extra white spaces
    match = ' '.join(match.split())

    # decode html characters
    match = HTMLParser().unescape(match)

    # always normalize to lower case
    return match.lower()
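cleanUpMatch assumes that re and HTMLParser are imported and that a module-level TRANSLATIONS mapping (special character -> replacement) exists; none of that is shown in the example. A hypothetical run, assuming the function above is in scope and using a small stand-in mapping (the dict below is a guess, not the project's real table):

import re
from HTMLParser import HTMLParser

TRANSLATIONS = {u'\u2019': "'", u'\u201c': '"', u'\u201d': '"'}  # guessed mapping

raw = 'hyphen&#xAD;\n<lb/>ated word, &amp; an entity  <hi>inside</hi> tags'
print repr(cleanUpMatch(raw))  # -> u'hyphenated word, & an entity inside tags'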
Example No. 4
    def un_code_name(name):
        """
        Convert all the &# codes to char, remove extra-space and normalize
        :param name: string to convert
        :type name: str
        :return: converted string
        """
        from HTMLParser import HTMLParser

        name = name.replace('<![CDATA[', '').replace(']]', '')
        name = HTMLParser().unescape(name.lower())
        return name
Example No. 5
def normalize_string(string, charset=None, replacing=False):
    """
    Decode any string and convert it to unicode
    :param charset: encoding
    :type charset: str
    :param string: string to convert
    :type string: str or unicode
    :param replacing: whether to strip apostrophes (') from the result
    :type replacing: bool
    :return: converted unicode
    :rtype: unicode
    """
    if not isinstance(string, unicode):
        try:
            if re.search(u'=[0-9a-fA-F]{2}', string):
                string = string.decode('Quoted-printable')

            string = json.loads(u'%s' % string, encoding=charset)

        except ValueError:
            try:
                string = unicode(eval(string), 'raw_unicode_escape')

            except (SyntaxError, NameError):
                string = string.decode('latin-1')

            except TypeError:
                string = unicode(string, errors='ignore')

        except LookupError:
            return u''

        except TypeError:
            string = unicode(string, errors='ignore')

    string = remove_control_chars(string)
    string = fix_bad_unicode(string)
    string = unquote(string)
    string = string.replace(u'<![CDATA[', u'').replace(u']]', u'')
    string = HTMLParser().unescape(string)
    if replacing:
        string = string.replace(u"'", '')

    string = string.lower()

    return string
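normalize_string leans on helpers from the same project (remove_control_chars, fix_bad_unicode, unquote) that are not shown. Its least obvious step is the quoted-printable branch; in isolation that branch looks roughly like this (Python 2, using the stdlib 'quoted-printable' codec alias the example itself relies on):

import re

s = 'Caf=C3=A9'                       # quoted-printable encoded UTF-8 bytes
if re.search(u'=[0-9a-fA-F]{2}', s):
    s = s.decode('Quoted-printable')  # -> 'Caf\xc3\xa9'
print repr(s.decode('utf-8'))         # -> u'Caf\xe9'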
Example No. 6
def uncodeName(name):  # Convert all the &# codes to char, remove extra-space and normalize
    from HTMLParser import HTMLParser
    name = name.replace('<![CDATA[', '').replace(']]', '')
    name = HTMLParser().unescape(name.lower())
    return name
Example No. 7
    def uncode_name(name):  # convert all the &# codes to char, remove extra-space and normalize
        from HTMLParser import HTMLParser

        name = name.replace("<![CDATA[", "").replace("]]", "")
        name = HTMLParser().unescape(name.lower())
        return name
Example No. 8
def populate_restaurants(c):
    print 'Populating Restaurants table...'

    if not (os.access('restaurants', os.R_OK)
            and os.path.isdir('restaurants')):
        print >> sys.stderr, "Error: cannot access raw data directory 'restaurants'"
        sys.exit(1)

    if not (os.access('suburbs.txt', os.R_OK)
            and os.path.isfile('suburbs.txt')):
        print >> sys.stderr, "Error: cannot access raw data file 'suburbs.txt'"
        sys.exit(1)

    #get postcodes from file and cache in dict
    suburbs = open('suburbs.txt').readlines()
    postcodes = {}
    for suburb in suburbs:
        lat, lng, pst, sub = suburb.strip().split('\t')
        postcodes[sub] = pst
    postcodes['CBD'] = 2000  #special case not in data file

    users = c.execute('SELECT username FROM Users').fetchall()
    num_users = c.execute('SELECT COUNT(*) FROM Users').fetchone()[0]

    i = 0
    for restaurant in glob.glob('restaurants/*'):
        r = open(restaurant).readlines()

        #extract info from file
        try:
            name = r[0].strip()
            name = HTMLParser().unescape(name)
            address = r[1].strip()
            address = HTMLParser().unescape(address)
            address = re.sub(r'nsw', 'NSW', address, flags=re.I)
            if not address.endswith(', NSW'):
                address = address + ', NSW'
            suburb = re.match(r'.*, (.+), Sydney', r[1]).group(1)
            suburb = HTMLParser().unescape(suburb)
            phone = r[2].strip().replace('(', '').replace(')', '')
            if re.match('Not available', phone):
                phone = 'Not provided'
            hours = r[3].strip()
            hours = re.sub(r'\s*,\s*', ', ', hours)
            hours = HTMLParser().unescape(hours)
            cuisine = r[4].strip()
            cuisine = HTMLParser().unescape(cuisine)
            cost = r[5].strip()
            image = r[6].strip()
        except Exception:
            print >> sys.stderr, "Error: skipping '%s'" % restaurant
            continue

        #lookup postcode using suburb
        if suburb not in postcodes:
            continue
        postcode = postcodes[suburb]

        #and append it to the address
        address = address + ' ' + str(postcode)

        #choose a random protocol for the website
        protocol = 'http://'
        if random.randint(0, 1) == 1:
            protocol = 'https://'

        #make site of the form protocol://www.lowercase.name.of.restaurant.fake.com
        website = name.replace('  ', ' ').replace(' ', '.').replace(
            '-', '').strip() + '.fake.com'
        website = HTMLParser().unescape(website)
        website = urllib.quote(website)  #encode as url
        website = protocol + 'www.' + website  #avoid encoding the protocol
        website = website.lower().replace('..', '.')

        #ensure only some restaurants have owners
        owner = None
        if random.randint(0, 3) == 0:
            owner = users[random.randint(0, num_users - 1)][0]

        i += 1
        data = (i, name, suburb, address, postcode, phone, hours, cuisine,
                owner, website, cost, image)
        c.execute(
            '''INSERT INTO Restaurants
				(id, name, suburb, address, postcode, phone, hours, cuisine, owner, website, cost, image)
				VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)''', data)
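A hypothetical driver for populate_restaurants, assuming a sqlite3 database whose Users and Restaurants tables match the SELECT and INSERT statements above (Python 2):

import sqlite3

conn = sqlite3.connect('restaurants.db')  # hypothetical database file
c = conn.cursor()
populate_restaurants(c)
conn.commit()
conn.close()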
Example No. 9
f = codecs.open('vk.txt', mode='a', encoding='utf-8')
f.write('--------------------\n')
f.write(u'количество клубов - ' + str(quantity_of_clubs) + u'\n')  # 'количество клубов' = 'number of clubs'

i = 1
while i <= quantity_of_clubs:
    club_number = randrange(1000000, 30000000)
    if club_number not in visited_clubs:
        visited_clubs.add(club_number)
        data = urlopen('http://vk.com/club%s' % club_number).read().decode('utf8')
        title = HTMLParser().unescape(re.findall('<title>(.*)</title>', data)[0])
        if title not in [u'Ошибка', u'Частная группа']:  # skip error / private-group pages
            # note: re.findall returns a list here, so unescape is effectively a no-op
            quantity = HTMLParser().unescape(re.findall(u'Участники\s+<em class="pm_counter">(\d+)</em>', data))
            if quantity:
                quantity_of_members.append(int(quantity[0]))
                words = title.lower().split(' ')
                for word in words:
                    # skip one-character words and words from the exclusion list
                    if len(word) != 1 and word not in words_to_exclude:
                        new_word = rx3.sub(u'-', rx2.sub(u'', rx1.sub(u'', word)))  # strip everything except letters and '-'
                        # skip empty words, one-character words and words from the exclusion list
                        if new_word != '' and len(new_word) != 1 and new_word not in words_to_exclude:
                            words_in_club_titles.append(new_word)
                i += 1


counts = Counter(words_in_club_titles)
top = counts.most_common(10)
for element in top:
    print "'%s', %d " %(element[0], element[1])
    f.write("'%s', %d " %(element[0], element[1]))
Example No. 10
i = 1
while i <= quantity_of_clubs:
    club_number = randrange(1000000, 30000000)
    if club_number not in visited_clubs:
        visited_clubs.add(club_number)
        data = urlopen('http://vk.com/club%s' %
                       club_number).read().decode('utf8')
        title = HTMLParser().unescape(
            re.findall('<title>(.*)</title>', data)[0])
        if title not in [u'Ошибка', u'Частная группа']:  # skip error / private-group pages
            # note: re.findall returns a list here, so unescape is effectively a no-op
            quantity = HTMLParser().unescape(
                re.findall(u'Участники\s+<em class="pm_counter">(\d+)</em>',
                           data))
            if quantity:
                quantity_of_members.append(int(quantity[0]))
                words = title.lower().split(' ')
                for word in words:
                    # skip one-character words and words from the exclusion list
                    if len(word) != 1 and word not in words_to_exclude:
                        # strip everything except letters and '-'
                        new_word = rx3.sub(u'-', rx2.sub(u'', rx1.sub(u'', word)))
                        # skip empty words, one-character words and words from the exclusion list
                        if (new_word != '' and len(new_word) != 1
                                and new_word not in words_to_exclude):
                            words_in_club_titles.append(new_word)
                i += 1

counts = Counter(words_in_club_titles)
top = counts.most_common(10)