Example #1
def strip_unicode(text):
    clean = ftfy.fix_text(ftfy.fix_encoding(text))
    temp = ""
    try:
        temp = clean.decode('utf-8', 'ignore').encode('utf-8')
    except (UnicodeDecodeError, UnicodeEncodeError) as e:
        print "error1" + str(e)
        try:
            temp = text.encode('utf-8').decode('utf-8', 'ignore').encode('utf-8')
        except (UnicodeDecodeError, UnicodeEncodeError) as e:
            print "error2" + str(e)
    return temp
Example #2
def strip_unicode(text):
    clean = ftfy.fix_text(ftfy.fix_encoding(text))
    temp = ""
    try:
        temp = clean.decode('utf-8', 'ignore').encode('utf-8')
    except (UnicodeDecodeError, UnicodeEncodeError):
        # print "ERROR ONE"
        pass
    try:
        # print "first encoding failed, trying other way!"
        temp = text.encode('utf-8').decode('utf-8', 'ignore').encode('utf-8')
    except (UnicodeDecodeError, UnicodeEncodeError):
        # print "ERROR TWO"
        pass
    return temp
Example #3
def fetch():
    all_prices, timestamps = {}, []
    for i, url in enumerate(configuration.get('cardhoarder_urls')):
        s = fetcher_internal.fetch(url)
        s = ftfy.fix_encoding(s)
        timestamps.append(dtutil.parse_to_ts(s.split('\n', 1)[0].replace('UPDATED ', ''), '%Y-%m-%dT%H:%M:%S+00:00', dtutil.CARDHOARDER_TZ))
        all_prices[i] = parse_cardhoarder_prices(s)
    url = configuration.get('mtgotraders_url')
    if url:
        s = fetcher_internal.fetch(url)
        timestamps.append(dtutil.dt2ts(dtutil.now()))
        all_prices['mtgotraders'] = parse_mtgotraders_prices(s)
    if not timestamps:
        raise TooFewItemsException('Did not get any prices when fetching {urls} ({all_prices})'.format(urls=configuration.get('cardhoarder_urls') + [configuration.get('mtgotraders_url')], all_prices=all_prices))
    store(min(timestamps), all_prices)
Example #4
def tokenizer_word(data_dir, category, save_dir, sw_dir):
    list_fn = os.listdir(os.path.join(data_dir, category))
    file_train = os.path.join(save_dir, "Train", "{}.txt".format(category))
    file_test = os.path.join(save_dir, "Test", "{}.txt".format(category))

    # load stopwords from the stopwords file
    with open(sw_dir, "r") as f:
        stopwords = f.readlines()
    for i in range(len(stopwords)):
        stopwords[i] = stopwords[i].strip()

    # create empty train and test files for this category
    with open(file_train, "w", encoding="utf8"):
        pass
    with open(file_test, "w", encoding="utf8"):
        pass

    # split into train/test (60/40)
    mid = int(len(list_fn) * 0.6 + 0.5)

    for i, fn in enumerate(list_fn):
        fl = os.path.join(data_dir, category, fn)
        with open(fl, "r") as f:
            content = f.readlines()
        content = " ".join(content)
        content = content.replace("\n", " ")
        content = fix_encoding(content)
        content = ViTokenizer.tokenize(content)
        content = gensim.utils.simple_preprocess(content)
        content = [w for w in content if w not in stopwords]
        content = " ".join(content)
        if len(content) < 100:
            continue
        if i < mid:
            with open(file_train, "a", encoding="utf8") as f:
                f.write(content)
                f.write("\n")
        elif i == mid:
            with open(file_train, "a", encoding="utf8") as f:
                f.write(content)
        else:
            if i == (len(list_fn) - 1):
                with open(file_test, "a", encoding="utf8") as f:
                    f.write(content)
            else:
                with open(file_test, "a", encoding="utf8") as f:
                    f.write(content)
                    f.write("\n")
Example #5
def get_results_from_lines(lines, locale_template):
    # lines = []
    all_results = []
    to_check_items = list(locale_template.items())
    already_check_items = list()
    value = {}
    prev_field = None
    for line in lines:
        line = ftfy.fix_encoding(line)
        found_value, value_new_entries = parse_line(line, to_check_items)
        if found_value:
            field, expression = found_value

            if field == "matrix_sub_value":
                value_new_entries = {"{}{}".format(prev_field, k): v for k, v in value_new_entries.items()}

            value.update(value_new_entries)
            prev_field = field
            if field == "end_of_file":
                if value:
                    result = seal_result(value, to_check_items)
                    all_results.append(result)
                to_check_items = reset_check_items(locale_template)
                value = {}

            if field in permanent_matches:
                continue

            already_check_items.append(found_value)
            to_check_items.remove(found_value)
            if len(to_check_items) <= len(permanent_matches):
                # I don't need to check any other field
                # print("No other field to check for filename {}", filename)
                all_results.append(seal_result(value, to_check_items))
                value = {}
        else:
            # potentially the line is unwanted so we need to reset the items and add results
            found_value, value_new_entries = parse_line(line, already_check_items)
            if found_value:
                # Something wrong in the format of the file
                # try create a new value starting from this line
                all_results.append(seal_result(value, to_check_items))
                value = value_new_entries
                to_check_items = reset_check_items(locale_template)
                to_check_items.remove(found_value)

    # This is the end of the file
    return all_results
Example #6
def fetch_content(request_url, extractor):
    meta = {}

    # fetch using python
    url, response = fetch_url(request_url)

    if response is not None:
        # extract content
        el, txt, meta = extractor.extract(response, url)

        if txt is not None and txt != '':
            txt = fix_encoding(txt)
            return url, response, txt, meta

    # give up
    return url if url else request_url, None, None, meta
Example #7
    def _sanitize_story_line(line):
        line = ftfy.fix_encoding(line)

        sentence_endings = [
            ".", "!", "?", "...", "'", "`", '"', ")", "\u2019", "\u201d"
        ]

        # CNN stories always start with "(CNN)"
        if line.startswith("(CNN)"):
            line = line[len("(CNN)"):]

        # Highlights are essentially bullet points and don't have proper sentence endings
        if line[-1] not in sentence_endings:
            line += "."

        return line
Example #8
    def getDatasetList(self):
        response = requests.get(self.reqUrl)
        linkList = BeautifulSoup(response.text, "html.parser").find_all('a')
        fixedList = list()
        datasetList = list()

        for link in linkList:
            fixedList.append(fix_encoding(link.get('href')))

        for item in fixedList:
            if item.find('Bases') != -1:
                fixedItem = re.sub(r'Bases/|\.xlsx', '', item)
                datasetList.append(fixedItem)

        response.close()
        return datasetList
Example #9
    def _fetch_chart(self):
        req = urllib.request.Request(self.url)
        page = urllib.request.urlopen(req, data=None, timeout=15)

        root = lxml.html.parse(page)

        rank = 1
        entry = None

        for element in root.iter(tag=lxml.etree.Element):
            cls = element.get('class')

            if cls is None:
                continue

            if cls == 'wrap_rank':
                entry = ChartEntry()

                entry.rank = rank

                if rank > self.limit:
                    break
                else:
                    self.append(entry)
                    rank += 1

                entry.change = element[0].get('class').replace(
                    'icon_', '').replace('static',
                                         'none').replace('rank_', '')

                if entry.change != 'new' and len(element) >= 2:
                    entry.change_diff = element[1].text_content().strip()

            if cls == 'wrap_song_info' and entry is not None:
                next = False
                for a in element.iter(tag='a'):
                    if not next:
                        entry.title = ftfy.fix_encoding(
                            a.text_content().strip())
                        next = True
                    else:
                        for artist in a.text_content().split('|')[0].replace(
                                ' & ', ',').split(','):
                            entry.artists.append(Artist(artist.strip()))

                        break
Example #10
def get_articles(id_site, interval_start, interval_stop):
    query = "SELECT id_article, xmltext FROM Article WHERE id_site = {} AND stimestamp between (1<<31)-unix_timestamp()+3600*24*{} AND (1<<31)-unix_timestamp()+3600*24*{}".format(
        id_site, interval_stop, interval_start)
    connection = pymysql.connect(host=CONNECTION_DETAILS["host"],
                                 port=CONNECTION_DETAILS["port"],
                                 user=CONNECTION_DETAILS["user"],
                                 passwd=CONNECTION_DETAILS["passwd"],
                                 db=CONNECTION_DETAILS["db"],
                                 charset=CONNECTION_DETAILS["charset"])
    cur = connection.cursor(pymysql.cursors.DictCursor)
    cur.execute(query)
    list_of_articles = []
    for line in cur.fetchall():
        text = ftfy.fix_encoding(line["xmltext"])
        plain_text = remove_html_tags(text)
        out = plain_text.translate(str.maketrans("", "", string.punctuation))
        list_of_articles.append(out)
    return list_of_articles
Example #11
def clean_text(x):
    '''General purpose text cleaner that reads in a string, applies a Unicode sandwich, strips special characters, and returns a list of tokens.'''

    #Fix encoding in case there is a problem
    x = ftfy.fix_encoding(x)
    #Unicode Sandwich...
    x = x.encode('UTF-8')  #encode as UTF-8
    input_code = chardet.detect(bytes(x))['encoding']  #Auto-detect encoding
    u_string = x.decode(input_code)  #Decode
    re.sub(r'\w{4}', u'xxxx', u_string,
           flags=re.UNICODE)  #Handle funny unicode artifacts
    x = u_string.encode('UTF-8')  #Encode back to UTF-8
    x = str(x)  #Convert to string

    x = re.sub(r'[\[\]\"\'\,]', ' ', x)  #Remove backslashes
    x = re.sub(r'[^\w\s]', ' ', x)  #Insert space after end of each sentence
    x = x.split()  #split string to list
    return x
Example #12
def chat():
    print("Enter your question and (quit) to stop!")
    while True:
        inp = input("You: ")
        if inp.lower() == "quit":
            break

        results = model.predict([bag_of_words(inp, words)])
        #print(bag_of_words(inp, words))
        results_index = numpy.argmax(results)
        #print(results)
        tag = labels[results_index]

        for tg in data["intents"]:
            if tg['tag'] == tag:
                responses = tg["responses"]
                print(fix_encoding(random.choice(responses)))
                break
Example #13
    def _fetch_chart(self):
        req = urllib.request.Request(self.url)
        page = urllib.request.urlopen(req, data=None, timeout=15)

        root = lxml.html.parse(page)

        rank = 1
        entry = None

        for element in root.iter(tag=lxml.etree.Element):
            cls = element.get('class')

            if cls is None:
                continue

            if cls == 'ranking':
                entry = ChartEntry()

                entry.rank = element.text_content().strip()

                if rank > self.limit:
                    break
                else:
                    self.append(entry)
                    rank += 1

            if cls == 'change':
                change = element[0].get('class')
                change = change if change else 'none'
                entry.change = change

                change_diff = element.text_content().strip()

                if (change == 'up'
                        or change == 'down') and change_diff != 'HOT':
                    entry.change_diff = change_diff

            if cls == 'subject':
                entry.title = ftfy.fix_encoding(
                    element[0].text_content().strip())

                for artist in element[1].text_content().split('|')[0].replace(
                        ' & ', ',').split(','):
                    entry.artists.append(Artist(artist.strip()))
Example #14
def get_in4_from_positon(i):
    data = pd.read_csv("static/data_img/data.csv")
    name_product = list(data['name_product'])
    link = list(data['link'])
    price = list(data['price'])
    try:
        fix_name = fix_encoding(name_product[i])
        name_product[i] = fix_name
    except:
        pass
    return name_product[i], link[i], str(f'{int(price[i]):n}') + ' Đồng'


# i = get_locate_image('static/data_img/image/1588_00002.jpg')
# data = pd.read_csv("static/data_img/data.csv")
# name_img = list(data['name_img'])

# a, b, c = get_in4_from_positon(0)
# print(a, b, c)
Example #15
def run() -> None:
    files = rotation.files()
    n = len(files)
    time_until = TIME_UNTIL_ROTATION - datetime.timedelta(weeks=1)
    if n >= rotation.TOTAL_RUNS:
        print(
            'It is the moment of discovery, the triumph of the mind, and the end of this rotation.'
        )
        return

    if n == 0 and TIME_UNTIL_ROTATION > datetime.timedelta(7):
        print(
            'The monks of the North Tree rarely saw their kodama until the rotation, when it woke like a slumbering, angry bear.'
        )
        print('ETA: {t}'.format(
            t=dtutil.display_time(int(time_until.total_seconds()))))
        return

    if n == 0:
        rotation.clear_redis(clear_files=True)
    #else:
    #    rotation.clear_redis()

    all_prices = {}
    for url in configuration.get_list('cardhoarder_urls'):
        s = fetch_tools.fetch(url)
        s = ftfy.fix_encoding(s)
        all_prices[url] = parse_cardhoarder_prices(s)
    url = configuration.get_str('mtgotraders_url')
    if url:
        s = fetch_tools.fetch(url)
        all_prices['mtgotraders'] = parse_mtgotraders_prices(s)

    run_number = process(all_prices)
    if run_number == rotation.TOTAL_RUNS:
        make_final_list()

    try:
        url = f'{fetcher.decksite_url()}/api/rotation/clear_cache'
        fetch_tools.fetch(url)
    except Exception as c:  # pylint: disable=broad-except
        print(c)
Example #16
def processPage(page):
    title = page['title']
    if title not in page_uuids:
        return

    uuid = page_uuids[title]

    children = []

    if 'children' in page.keys():
        for child in page['children']:
            children.append({
                'html':
                applyHeading(renderMarkdown(fix_encoding(child['string'])),
                             child) + renderBullets(child)
            })

    template_data = {
        'title': renderMarkdown(title, ignoreLinks=True),
        'blocks': children,
        'uuid': uuid,
        'references': []
    }

    global _linksTo

    for item in _linksTo:
        item['link_from'] = uuid
        item['title'] = renderMarkdown(title, ignoreLinks=True)
        item['text'] = renderMarkdown(item['text'], ignoreLinks=True)

        #if item['uuid'] == uuid:
        #    continue

        if item['link_to'] in references.keys():
            references[item['link_to']].append(item)
        else:
            references[item['link_to']] = [item]

    _linksTo = []

    page_data[title] = template_data
Example #17
def fix_characters_in_string(text):
    """
    Removes control characters such as \r\n \x1b \ufffd from string impl and returns a unicode
    string where all control characters have been replaced by a space.
    :param text: expects a unicode string
    :return: unicode string
    """

    # deal with encoding
    new_text = fix_encoding(text)

    # remove unicode characters from "Specials" block
    # see: https://www.compart.com/en/unicode/block/U+FFF0
    new_text = re.sub(r"\\ufff.", " ", new_text.encode("unicode-escape"))

    # remove all kinds of control characters and emojis
    # see: https://www.fileformat.info/info/unicode/category/index.htm
    new_text = u"".join(ch if unicodedata.category(ch)[0] != "C" else " " for ch in new_text.decode("unicode-escape"))

    return new_text
Example #18
def ocr(filename):
    """
    This function will handle the core OCR processing of images.
    """
    i = cv2.imread(filename)

    # Convert to gray
    i = cv2.cvtColor(i, cv2.COLOR_BGR2GRAY)

    # Apply dilation and erosion to remove some noise
    kernel = np.ones((1, 1), np.uint8)
    i = cv2.dilate(i, kernel, iterations=1)
    i = cv2.erode(i, kernel, iterations=1)

    text = pytesseract.image_to_string(i)
    #return text

    # Cleaning all the gibberish text
    text = ftfy.fix_text(text)
    text = ftfy.fix_encoding(text)
    return text
Example #19
def process_target_tables(tar_tables, doc):
    # only the first 3 tables are of interest
    for x in range(0, 3):
        tablenum = x
        tr_list = tar_tables[x].find_all("tr")
        print("--- Row list length :" + str(len(tr_list)))
        # time.sleep(1)
        # tr_list[0] is just the column headers... can be ignored
        for y in range(1, len(tr_list)):
            curr_rowdata = tr_list[y].find_all("td")
            print(curr_rowdata)
            print("^^^^^^^^^")
            tar_info = process_row_info(curr_rowdata, tablenum)

            # exceptions for wonky seasonal table
            # if any of these trigger, it's an empty data thing to be ignored
            try:
                if (tar_info[1].strip()) == "":
                    pass
                    # print("Detected nothing at idx 1!")
                elif (tar_info[2].strip()) == "":
                    pass
                    # print("Detected nothing at idx 2!")
                elif (tar_info[3].strip()) == "":
                    pass
                    # print("Detected nothing at idx 3!")
                else:
                    for z in tar_info:
                        try:
                            purified_line = ftfy.fix_encoding(z)
                            fixed_line = ftfy.fix_text(purified_line)
                            doc.add_paragraph(fixed_line)
                            # print(x.encode("ascii", "replace"))
                            # run printing operation here
                        except:
                            print("Check for line info processing error...")
                            time.sleep(1)
            except:
                pass
    print("[+] Page processing complete! ---")
Example #20
def clean_string(s):
    s = str(s)
    if isnull(s):
        return None
    elif re.search('[a-zA-Z]', s) is None:
        return None
    else:
        s = remove_bom(s)
        s = remove_control_chars(s)
        s = fix_encoding(s)
        s = fix_text(s)
        s = fix_partial_utf8_punct_in_1252(s)
        s = decode_escapes(s)
        s = fix_latin_ligatures(s)
        s = uncurl_quotes(s)
        s = s.replace("Äu0087", "ć")
        s = s.replace("Äu0090", "Đ")
        s = s.replace("Ãu0096", "Ö")
        s = s.replace("Åu008D", "ō")

        s = s.replace("\\", " ")
        s = s.replace("/", " ")
        s = s.replace("ö", "ö")

        p = re.compile("^\w+[A-Z]{1}\w*$")
        if p.search(s):
            # From: https://stackoverflow.com/a/37697078
            s = re.sub(r'(?!^)([A-Z][a-z]+)', r' \1', s)

        new_string = ""
        p = False
        for letter in s:
            if letter in "([":
                p = True
            elif letter in ")]":
                p = False
                continue
            if not p:
                new_string += letter
        return new_string.strip()
Example #21
def clean_data(x,
               rem_stopwords=True,
               ner=True,
               rem_punc=True,
               lemmatize=False):
    '''Text cleaner that applies preprocessing rules to text. Preprocessing options include NER, stop-word removal, lemmatization, HTML cleaning, and punctuation removal.'''

    #Unicode Sandwich
    #Fix encoding in case there is a problem
    x = ftfy.fix_encoding(x)
    #Unicode Sandwich...
    x = x.encode('UTF-8')  #encode as UTF-8
    input_code = chardet.detect(bytes(x))['encoding']  #Auto-detect encoding
    try:
        u_string = x.decode(input_code)  #Decode
    except:
        u_string = x.decode('UTF-8')
    re.sub(r'\w{4}', u'xxxx', u_string,
           flags=re.UNICODE)  #Handle funny unicode artifacts
    x = u_string.encode('UTF-8')  #Encode back to UTF-8
    x_clean = str(x)  #Convert to string

    if ner == True:
        x_clean = capitalize_named_entities(x_clean)

    if rem_stopwords == True:
        x_clean = remove_stopwords(x_clean)

    if lemmatize == True:
        x_clean = lemmatize_text(x_clean)

    soup = BeautifulSoup(x_clean, 'html.parser')
    x_clean = soup.text

    if rem_punc == True:
        x_clean = remove_punctuation(x_clean)

    #tokens = word_tokenize(x_clean)
    return x_clean
Example #22
def cleanText(x):
    '''General purpose text cleaner that turns text into a list of tokens fit for Word2Vec and Doc2Vec
        INPUT: string;
        OUTPUT: list of tokens.'''

    #Fix encoding in case there is a problem
    x = ftfy.fix_encoding(x)
    #Unicode Sandwich...
    x = x.encode('UTF-8') #encode as UTF-8
    input_code = chardet.detect(bytes(x))['encoding'] #Auto-detect encoding
    try:
        u_string = x.decode(input_code) #Decode
    except:
        u_string = x.decode('UTF-8')
    re.sub(r'\w{4}', u'xxxx', u_string, flags=re.UNICODE) #Handle funny unicode artifacts
    x = u_string.encode('UTF-8') #Encode back to UTF-8
    x = str(x) #Convert to string
    
    x = re.sub(r'[\[\]\"\'\,]',' ', x) #Remove backslashes
    x = re.sub(r'[^\w\s]',' ',x) #Insert space after end of each sentence
    x = x.split() #split string to list
    return x
Example #23
    def horo(self, irc, msg, args, midnight_check, psign):
        """[bool: Only print when local hour is 0 - default False] [sign: if not set will dump all signs]
        Returns horoscope in french for one zodiacal sign or all
        """
        signs = []
        tsigns = [
            'belier', 'taureau', 'gemeaux', 'cancer', 'lion', 'vierge',
            'balance', 'scorpion', 'sagittaire', 'capricorne', 'verseau',
            'poissons'
        ]

        if midnight_check and int(
                datetime.datetime.fromtimestamp(time()).strftime('%H')):
            self.log.info(
                "Horoscope plugin: checked and not [00:00-00:59] local time")
            return
        if psign:
            if psign not in tsigns:
                irc.error("Same player try again!")
            else:
                signs = [psign]
        else:
            signs = tsigns
        for sign in signs:
            url = "https://www.lhoroscope.com/horoscope-general/horoscope-%s-du-jour.asp" % sign
            try:
                result = requests.get(url)
                soup = BeautifulSoup(result.content, "html.parser")
                h = soup.find(class_="panel-body").text.strip()
                h = fix_encoding(h)
            except Exception as e:
                irc.error("Error {}".format(e))
                break
            else:
                irc.reply(ircutils.bold(sign.title()) + " : " + h,
                          prefixNick=False)
            sleep(2)
Example #24
def run() -> None:
    files = rotation.files()
    n = len(files)
    time_until = min(
        TIME_UNTIL_FULL_ROTATION,
        TIME_UNTIL_SUPPLEMENTAL_ROTATION) - datetime.timedelta(weeks=1)
    if n >= TOTAL_RUNS:
        print(
            'It is the moment of discovery, the triumph of the mind, and the end of this rotation.'
        )

        if time_until < datetime.timedelta(days=3):
            for f in files:
                print('Removing {f}'.format(f=f))
                os.remove(f)
        return
    if n == 0 and TIME_UNTIL_FULL_ROTATION > datetime.timedelta(
            7) and TIME_UNTIL_SUPPLEMENTAL_ROTATION > datetime.timedelta(7):
        print(
            'The monks of the North Tree rarely saw their kodama until the rotation, when it woke like a slumbering, angry bear.'
        )
        print('ETA: {t}'.format(
            t=dtutil.display_time(time_until.total_seconds())))
        return
    all_prices = {}
    for url in configuration.get_list('cardhoarder_urls'):
        s = fetcher_internal.fetch(url)
        s = ftfy.fix_encoding(s)
        all_prices[url] = parse_cardhoarder_prices(s)
    url = configuration.get_str('mtgotraders_url')
    if url:
        s = fetcher_internal.fetch(url)
        all_prices['mtgotraders'] = parse_mtgotraders_prices(s)

    run_number = process(all_prices)
    if run_number == TOTAL_RUNS:
        make_final_list()
Example #25
    def stream_json_file(self,
                         json_file_name,
                         output_file_name="parsed_tweets.csv",
                         stop_at=1000000000,
                         verbose=False):
        # Create an empty csv file that we will append to
        # Create a header row for it
        if verbose:
            print("Initalizing Output File:  %s" % output_file_name)
            print("Generating Header Row")
        with open('%s' % output_file_name, 'w') as f:
            f.write(
                'id,text,created_at,language,retweet_count,screen_name,country,user_followers_count,time_zone,user_account_location,longitude,lattitude,name\n'
            )  # Column headers and a trailing newline. MAKE SURE the \n is attached to the last field, e.g. text\n

        tweet_counter = 0

        for i in open(json_file_name):

            if tweet_counter > stop_at:
                break

            try:
                # Put the data from the current tweet into a list
                # Parse the current tweet
                current_tweet = json.loads(i)

                ##################################################################

                ## Elements that are 1 level deep ##

                # Get the id or insert a nan if not found
                if 'id' in current_tweet:
                    id = current_tweet['id']
                else:
                    id = np.nan

                # Get text or insert nan

                if 'text' in current_tweet:
                    text = current_tweet['text']

                    # the fix_encoding function from the FTFY package takes care of weird characters
                    text = ftfy.fix_encoding(text)
                else:
                    text = np.nan

                # Get created_at or insert nan

                if 'created_at' in current_tweet:
                    created_at = current_tweet['created_at']

                else:
                    created_at = np.nan

                # Get language or insert nan

                if 'lang' in current_tweet:
                    language = current_tweet['lang']

                else:
                    language = np.nan

                    # get retweet count or insert nan

                if 'retweet_count' in current_tweet:
                    retweet_count = current_tweet['retweet_count']
                else:
                    retweet_count = np.nan

                ## Elements that are 2 levels deep ##

                # For elements that are 2 levels deep, compare with != None because JSON null values are loaded as Python None

                # get screen name or insert nan

                if 'user' in current_tweet and 'screen_name' in current_tweet[
                        'user']:
                    screen_name = current_tweet['user']['screen_name']
                else:
                    screen_name = np.nan

                # get name or insert nan
                if 'user' in current_tweet and 'name' in current_tweet['user']:
                    name = current_tweet['user']['name']
                else:
                    name = np.nan

                    # get country or insert nan

                if current_tweet['place'] != None and current_tweet['place'][
                        'country'] != None:
                    country = current_tweet['place']['country']
                else:
                    country = np.nan

                # get the author's follower count or nan

                if current_tweet['user'] != None and current_tweet['user'][
                        'followers_count'] != None:
                    followers_count = current_tweet['user']['followers_count']
                else:
                    followers_count = np.nan

                # get the timezone or nan
                if current_tweet['user'] != None and current_tweet['user'][
                        'time_zone'] != None:
                    time_zone = current_tweet['user']['time_zone']
                else:
                    time_zone = np.nan

                # get the account location or insert nan

                if current_tweet['user'] != None and current_tweet['user'][
                        'location'] != None:
                    account_location = current_tweet['user']['location']
                    account_location = ftfy.fix_encoding(account_location)
                else:
                    account_location = np.nan

                ###### Elements that are 3 levels deep ##################################

                if current_tweet['coordinates'] != None and current_tweet[
                        'coordinates']['coordinates'] != None and len(
                            current_tweet['coordinates']['coordinates']) == 2:
                    longitude = current_tweet['coordinates']['coordinates'][0]
                else:
                    longitude = np.nan

                if current_tweet['coordinates'] != None and current_tweet[
                        'coordinates']['coordinates'] != None and len(
                            current_tweet['coordinates']['coordinates']) == 2:
                    lattitude = current_tweet['coordinates']['coordinates'][1]
                else:
                    lattitude = np.nan

                ######################################################################################################
                # Assemble the row
                cleaned_current_tweet = [
                    id, text, created_at, language, retweet_count, screen_name,
                    country, followers_count, time_zone, account_location,
                    longitude, lattitude, name
                ]

                # Increment the Tweet Counter
                tweet_counter = tweet_counter + 1

                # Give the user a progress update
                if tweet_counter % 1000 == 0 and verbose:
                    print(" %d Tweets Parsed so far....." % tweet_counter)

                # append the current tweet as a row to the csv
                with open('%s' % output_file_name, 'a', newline='') as f:
                    writer = csv.writer(f)
                    writer.writerow(cleaned_current_tweet)

            except:
                pass
        if verbose:
            print(" ")
            print(" Parsing Complete:    %d Tweets Parsed " % tweet_counter)
Example #26
def add_items(pickle_data, items, descriptions):
    for i in range(len(items)):
        pickle_data['Food'].append(
            [ftfy.fix_encoding(items[i]),
             ftfy.fix_encoding(descriptions[i])])
    return pickle_data
Example #27
def enc(text):
    if text:
        return ftfy.fix_encoding(HTMLParser().unescape(text)).encode(
            "utf-16be", "replace")
Example #28
#cv2.imshow("Image", image)
#cv2.imshow("Output", gray)
#cv2.waitKey(5000)

# writing extracted data into a text file
text_output = open('outputbase.txt', 'w', encoding='utf-8')
text_output.write(text)
text_output.close()

file = open('outputbase.txt', 'r', encoding='utf-8')
text = file.read()
# print(text)

# Cleaning all the gibberish text
text = ftfy.fix_text(text)
text = ftfy.fix_encoding(text)
print(text)

# Initializing data variable
name = None
fname = None
dob = None
pan = None
nameline = []
dobline = []
panline = []
text0 = []
text1 = []
text2 = []

# Searching for PAN
Example #29
def clean_html_string(x):
    return ftfy.fix_encoding(
        ftfy.fix_text(x.replace("\n", "").replace("\t", "").strip(),
                      normalization='NFKC'))
Example #30
def main():
    print(ftfy.fix_encoding('주문하다 - to intent for?'))
Example #31
def fix_encoding(s):
    s = ftfy.fix_encoding(s)
    better, _ = codecs.escape_decode(s)
    return better.decode("utf-8").strip()
Example #32
def fix_encoding(elem):
    if isinstance(elem, unicode):
        return ftfy.fix_encoding(elem)
    else:
        return elem

response = urllib.urlopen('http://www.gutenberg.org/files/2600/2600-0.txt')
print(response.info())
html = response.read()
response.close() 

words = {}

html = html.split()
print(len(html))

def addWord(words,word):
    if not word in words:
        words[word] = 0
    words[word] += 1

for word in html:
    word = fix_encoding(word.decode('utf-8').lower())
    word = ''.join(ch for ch in word if ch not in exclude)
    if word.find(u"\u2014") > 0:
        word = word.split(u"\u2014")

    if not type(word) is list:
        addWord(words,word)
    else:
        for w in word:
            addWord(words,w) 

print(words)
Example #34
    def __init__(self, name):
        name = ftfy.fix_encoding(name)
        self._name = self._english_artist(name)
Example #35
    def fix_component_encodings(cls, components):
        return {k: ftfy.fix_encoding(safe_decode(v)) for k, v in six.iteritems(components)}
Example #36
def normalize_text(text, fix_encoding=False, strip_emojis=False):
    """
    Normalize text:

    * normalize accents (using NFC convention),
    * strip control/invisible chars and leftover combining diacritics,
    * undo ligatures,
    * normalize quotes, apostrophes and unicode characters (dashes, etc.),
    * normalize spaces (all spaces, including nbsp and tabs, will be encoded as 0x20),
    * strip and collapse multiple spaces into one,
    * etc.

    Optionally:

    * try to detect and fix encoding issues (see `ftfy.fix_encoding <https://ftfy.readthedocs.io/en/latest/>`_)
       on a per-sentence basis (delimited by newlines);
    * strip emojis (using a simple regex, not all cases are covered !)


    :param text: the text to normalize, newlines will be preserved;
    :param fix_encoding: if set, use ftfy to fix encoding issues on a per-sentence basis;
    :param strip_emojis: if set, try to find and strip unicode emojis;
    :return: the normalized text
    """
    # optionally fix encoding using ftfy
    if fix_encoding and wrong_encoding_pattern.search(text) is not None:
        try:
            import ftfy
            text = '\n'.join(
                ftfy.fix_encoding(t) if wrong_encoding_pattern.search(t) is not None else t
                for t in text.split('\n'))
        except ModuleNotFoundError:
            print(
                'WARNING: norm_punc.py, fixing encoding requires the ftfy package: pip install ftfy.'
            )

    # normalize (e.g. combining diacritics)
    text = unicodedata.normalize('NFC', text)

    # optionally strip emojis
    if strip_emojis:
        # I formerly used the emoji library, which is really nice but slow (and doesn't cover ASCII misc symbols).
        # I thus preferred to use a simpler regex that covers most cases and is way faster
        # (203ms to process 164343 short sentences, against 31s with emoji)
        text = emoji_pattern.sub(' ', text)

    # apply patterns in order
    for i, (typ, pattern, replace) in enumerate(normalization_patterns):
        if typ == REG:
            text = pattern.sub(replace, text)
        else:
            text = text.replace(pattern, replace)

    # normalize spaces
    text = spaces_pattern.sub(' ', text)

    # don't forget to normalise spaces at the beginning and end
    text = re.sub(r'(^|\n)\s+', r'\1', text)
    text = re.sub(r'\s+(\n|$)', r'\1', text)

    return text
Example #37
import ftfy

with open("data/small_text.json") as fp:
    for line in fp:
        print(ftfy.fix_encoding(line), end='')
Example #38
def fix_encoding(string):
    return jsonify({
        "fixed": ftfy.fix_encoding(string)
    })