def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text).strip()
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode("utf-8")
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == "en":
        normalized = english.normalize(text)
    elif lang == "ja" and disambiguation is not None:
        match = re.search(r"\((.*?)\)", disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split("/", 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = "n"
            disambiguation = pos + "/" + re.sub(r"\s*\((.*?)\)\s*", "", rest)
        normalized = preprocess_text(text).lower()
    else:
        normalized = preprocess_text(text).lower()

    if disambiguation is not None:
        disambiguation = disambiguation.strip().replace(" ", "_").lower()

    if disambiguation:
        return "/c/%s/%s/%s" % (lang, normalized.replace(" ", "_"), disambiguation)
    else:
        return "/c/%s/%s" % (lang, normalized.replace(" ", "_"))

def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text).strip()
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode('utf-8')
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == 'en':
        normalized = normalize(text)
    elif lang == 'ja' and disambiguation is not None:
        match = re.search(r'\((.*?)\)', disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split('/', 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = 'n'
            disambiguation = pos + '/' + re.sub(r'\s*\((.*?)\)\s*', '', rest)
        normalized = text.lower()
    else:
        normalized = text.lower()

    if disambiguation is not None:
        disambiguation = disambiguation.strip().replace(' ', '_').lower()

    if disambiguation:
        return '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'), disambiguation)
    else:
        return '/c/%s/%s' % (lang, normalized.replace(' ', '_'))

def make_concept_uri(text, lang, disambiguation=None):
    text = ftfy.ftfy(text)
    if disambiguation is None:
        text, disambiguation = handle_disambig(text)
    if disambiguation is not None:
        if isinstance(disambiguation, str):
            disambiguation = disambiguation.decode('utf-8')
        disambiguation = ftfy.ftfy(disambiguation)

    if lang == 'en':
        normalized = english.normalize(text)
    elif lang == 'ja' and disambiguation is not None:
        match = re.search(r'\((.*?)\)', disambiguation)
        if match:
            parenthesized = match.group(1)
            pos, rest = disambiguation.split('/', 1)
            if parenthesized in JAPANESE_PARTS_OF_SPEECH:
                pos = JAPANESE_PARTS_OF_SPEECH[parenthesized]
            else:
                pos = 'n'
            disambiguation = pos + '/' + re.sub(r'\s*\((.*?)\)\s*', '', rest)
        normalized = preprocess_text(text).lower()
    else:
        normalized = preprocess_text(text).lower()

    if disambiguation is not None:
        disambiguation = disambiguation.replace(' ', '_')

    if disambiguation:
        return '/c/%s/%s/%s' % (lang, normalized.replace(' ', '_'), disambiguation)
    else:
        return '/c/%s/%s' % (lang, normalized.replace(' ', '_'))

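# Hedged usage sketch for the make_concept_uri variants above. The helpers they
# call (handle_disambig, english.normalize, preprocess_text) are not shown in
# these snippets, so this standalone function only reproduces the ftfy cleanup
# and the "/c/<lang>/<term>[/<disambiguation>]" path construction; the function
# name and example value are illustrative assumptions, not part of the source.
import ftfy

def sketch_concept_uri(text, lang, disambiguation=None):
    # repair mojibake/whitespace, then build the slash-delimited concept path
    normalized = ftfy.ftfy(text).strip().lower().replace(' ', '_')
    if disambiguation:
        disambiguation = disambiguation.strip().replace(' ', '_').lower()
        return '/c/%s/%s/%s' % (lang, normalized, disambiguation)
    return '/c/%s/%s' % (lang, normalized)

# sketch_concept_uri('semantic network', 'en', 'n') -> '/c/en/semantic_network/n'
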
def load_data(messages_directory):
    for threadname in os.listdir(messages_directory):
        if threadname.startswith('.'):
            continue
        if threadname == 'stickers_used':
            continue

        # the messages are stored in the "message.json" file in the threadname directory
        # we want to load those json dictionaries, put them in the threads list,
        # and do some data conversion
        filename = os.path.join(messages_directory, threadname, "message.json")
        threaddict = {}
        with open(filename) as f:
            threaddict = json.load(f)

        # alter every message a bit
        for message in threaddict['messages']:
            # now convert the timestamp into a real datetime object
            message['date'] = datetime.fromtimestamp(message['timestamp'])
            # for consistency, change sender_name to sender
            message['sender'] = message['sender_name']
            if 'content' not in message:
                message['content'] = ''
            message['content'] = ftfy.ftfy(message['content'])
            message['sender'] = ftfy.ftfy(message['sender'])

        # for consistency, copy participants to members
        threaddict['members'] = []
        if 'participants' in threaddict:
            for participant in threaddict['participants']:
                threaddict['members'].append(ftfy.ftfy(participant))

        threaddict['title'] = ftfy.ftfy(threaddict['title'])
        threaddict['members'].append(setup.user)

        # add index
        global threads
        threaddict['index'] = len(threads)
        threads.append(threaddict)

def safe_path(origtitle):
    title = safe_path_component(ftfy(origtitle))

    if len(title) == 0:
        title = origtitle = u'_'

    if title.startswith(u'-') or title.startswith(u'.'):
        title = u'_' + title

    try:
        charname = safe_path_component(unicodedata.name(origtitle[0]))
    except ValueError:
        charname = u'UNKNOWN'
    category = charname.split('_')[0]

    # some ridiculous stuff to give every article a unique name that can be
    # stored on multiple file systems and tab-completed
    if len(origtitle) == 1:
        pieces = [u'single_character', category, charname + '.json']
    else:
        try:
            charname2 = safe_path_component(unicodedata.name(origtitle[1]))
        except ValueError:
            charname2 = u'UNKNOWN'
        text_to_encode = unicodedata.normalize("NFKD", safe_path_component(title[:64]))
        finalpart = text_to_encode.encode('punycode').rstrip('-')
        pieces = [charname, charname2, finalpart + '.json']
    path = u'/'.join(pieces)
    return path

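# Small self-contained illustration of the Unicode-name bucketing that safe_path
# relies on. safe_path_component is not shown in the snippet above, so a plain
# space-to-underscore substitution stands in for it here (an assumption).
import unicodedata

def sketch_char_bucket(origtitle):
    try:
        charname = unicodedata.name(origtitle[0]).replace(u' ', u'_')
    except ValueError:
        charname = u'UNKNOWN'
    # the leading word of the character's Unicode name becomes the directory bucket
    return charname.split(u'_')[0]

# sketch_char_bucket(u'Berlin') -> u'LATIN'   (from u'LATIN_CAPITAL_LETTER_B')
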
def leeds_corpus_frequencies(corpusfile, stemmer):
    if stemmer is None:
        stemmer = lambda x: x
    infile = codecs.open(corpusfile, encoding='utf-8')
    freqs = defaultdict(int)
    tokenfreqs = defaultdict(int)
    for line in infile:
        line = ftfy(line.strip())
        if line:
            rank = line.split(' ')[0]
            if NUMBER_RE.match(rank) and line.count(' ') == 2:
                rank, freq, token = line.split(' ')
                stemmed = stemmer(token)
                print "%s -> %s" % (token, stemmed)
                freq = float(freq)
                freq_int = int(freq * 100)
                for word in stemmed.split(' '):
                    if ',' not in word:
                        freqs[word] += freq_int
                if ',' not in token:
                    tokenfreqs[token.lower()] += freq_int

    for key in tokenfreqs:
        if tokenfreqs[key] > freqs[key]:
            freqs[key] = tokenfreqs[key]
    return freqs

def load(self, path_to_file):
    """Loads .txt file from `path_to_file`.

    Arguments:
        path_to_file (pathlib.Path):
            Path to .txt file

    Returns:
        doc (chomskIE.utils.Document)
            Document object corresponding to .txt file in `path_to_file`.
    """
    if not self._validate_data_path(path_to_file, is_directory=False):
        raise PathError(f'{path_to_file} is not a valid file path.')

    try:
        text_obj = open(path_to_file, 'r')
        text = text_obj.read()
    except UnicodeDecodeError:
        text_obj = open(path_to_file, 'rb')
        text, _ = ftfy.guess_bytes(text_obj.read())
        text = ftfy.ftfy(text)

    name = str(path_to_file).split('/')[-1]
    paragraphs = [p.strip() for p in text.splitlines() if p]
    doc = Document(name=name, text=text, paragraphs=paragraphs)
    return doc

def load(self, english_model, path_to_file):
    """Loads .txt file from `path_to_file`.

    Arguments:
        english_model (spacy.lang):
            Trained SpaCy language pipeline.
        path_to_file (pathlib.Path):
            Path to .txt file

    Returns:
        doc, spacy_doc (tuple)
            ``doc`` is a ``chomskIE.utils.Document`` object corresponding
            to the .txt file in `path_to_file`.
            ``spacy_doc`` is a ``spacy.tokens.Document`` object corresponding
            to the .txt file in `path_to_file` processed by ``english_model``.
    """
    if not self._validate_data_path(path_to_file, is_directory=False):
        raise PathError(f'{path_to_file} is not a valid file path.')

    try:
        text_obj = open(path_to_file, 'r')
        text = text_obj.read()
    except UnicodeDecodeError:
        text_obj = open(path_to_file, 'rb')
        text, _ = ftfy.guess_bytes(text_obj.read())
        text = ftfy.ftfy(text)

    name = str(path_to_file).split('/')[-1]
    spacy_doc = english_model(text)
    doc = Document(name=name, text=None, paragraphs=None)
    return doc, spacy_doc

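# Standalone sketch of the decode-fallback pattern both load() variants use:
# read raw bytes, let ftfy.guess_bytes pick a plausible decoding, then run
# ftfy.ftfy over the result. The byte string here is a made-up example.
import ftfy

raw = 'café'.encode('utf-8')
text, _explanation = ftfy.guess_bytes(raw)
text = ftfy.ftfy(text)
# text == 'café'
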
def _read_csv(reader, header, encoding):
    """
    Given a constructed CSV reader object, a header row that we've read, and
    a detected encoding, yield its rows as dictionaries.
    """
    for row in reader:
        if len(row) == 0:
            continue
        row = [ftfy(cell.decode(encoding, 'replace')) for cell in row]
        row_list = zip(header, row)
        row_dict = dict(row_list)
        if len(row_dict['text']) == 0:
            continue
        row_dict['text'] = unicodedata.normalize(
            'NFKC', row_dict['text'].strip()
        )
        if row_dict.get('title') == '':
            del row_dict['title']
        if 'date' in row_dict:
            if row_dict['date'] == '':
                del row_dict['date']
            else:
                row_dict['date'] = int(row_dict['date'])
        if 'query' in row_dict or 'subset' in row_dict:
            queries = [cell[1] for cell in row_list
                       if cell[1] != ''
                       and (cell[0] == 'query' or cell[0] == 'subset')]
            if queries:
                row_dict['queries'] = queries
            if 'query' in row_dict:
                del row_dict['query']
            if 'subset' in row_dict:
                del row_dict['subset']
        yield row_dict

def clean_data(data):
    '''
    Augment the raw Facebook data for our graphing use cases
    '''
    # set timezone
    data['datetime'] = pd.DatetimeIndex(
        pd.to_datetime(data['timestamp_ms'], unit='ms')).tz_localize('UTC').tz_convert(
            config.TIMEZONE)

    # column for just date
    data['date'] = data["datetime"].apply(lambda d: datetime.datetime(
        year=d.year, month=d.month, day=d.day)).map(lambda x: x.date())

    # column for term of date
    data['term'] = pd.to_datetime(
        data['datetime']).apply(lambda d: "{} {}".format(
            d.strftime('%Y'), util.to_term(int(d.strftime('%m')))))

    # clean up sticker data
    data['sticker'] = data['sticker'].apply(lambda s: s['uri'] if not pd.isnull(s) else None)
    duplicate_likes = [
        "messages/stickers_used/851582_369239386556143_1497813874_n_369239383222810.png",
        "messages/stickers_used/851587_369239346556147_162929011_n_369239343222814.png"
    ]
    data['sticker'] = data['sticker'].replace(
        duplicate_likes,
        "messages/stickers_used/851557_369239266556155_759568595_n_369239263222822.png"
    )

    # format text properly
    data['content'] = data['content'].apply(lambda x: ftfy.ftfy(x) if type(x) == str else x)

    # properly set message type, adding types 'Game', 'Plan Update', 'Chat Update'
    warnings.filterwarnings("ignore", 'This pattern has match groups')
    data['game'] = data['content'].str.contains(chatstats_constants.GAME_REGEX, na=False)
    data['plan_update'] = data['content'].str.contains(
        chatstats_constants.PLAN_UPDATE_REGEX, na=False)
    data['chat_update'] = data['content'].str.contains(
        chatstats_constants.CHAT_UPDATE_REGEX, na=False)
    data['call_update'] = data['content'].str.contains(
        chatstats_constants.CALL_UPDATE_REGEX, na=False)
    data['type'] = data.apply(lambda x: clean_type(x), axis=1)

    # add first name column
    data['sender_first_name'] = data['sender_name'].apply(
        lambda s: s.split()[0])

    return data

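# Toy demonstration of the text-repair step in clean_data above. The DataFrame
# contents are invented; only the 'content' column name comes from the snippet.
import ftfy
import pandas as pd

toy = pd.DataFrame({'content': ['sÃ©ance', None]})
toy['content'] = toy['content'].apply(lambda x: ftfy.ftfy(x) if type(x) == str else x)
# toy['content'][0] is now 'séance'; non-string entries pass through untouched.
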
def open_csv_somehow(filename):
    """
    Given a filename that we're told is a CSV file, detect its encoding,
    parse its header, and return a generator yielding its rows as
    dictionaries.

    Use the `ftfy` module internally to fix Unicode problems at the level
    that chardet can't deal with.
    """
    encoding = detect_file_encoding(filename)
    csvfile = open(filename, 'rU')
    reader = csv.reader(csvfile, dialect='excel')
    header = reader.next()
    header = [ftfy(cell.decode(encoding).lower()) for cell in header]
    return _read_csv(reader, header, encoding)

def open_csv_somehow_py3(filename):
    encoding = detect_file_encoding(filename)
    csvfile = open(filename, 'r', encoding=encoding, newline='')
    line = csvfile.readline()
    csvfile.seek(0)

    if '\t' in line:
        # tab-separated
        reader = csv.reader(csvfile, delimiter='\t')
    else:
        reader = csv.reader(csvfile, dialect='excel')

    header = next(reader)
    header = [ftfy(cell.lower().strip()) for cell in header]
    return _read_csv(reader, header, encoding)

def read_leeds_corpus(filename):
    """
    Load word frequencies from a "Web as Corpus" file, collected and
    provided by the University of Leeds.

    For more information, see: http://corpus.leeds.ac.uk/list.html
    """
    infile = codecs.open(filename, encoding='utf-8')
    counts = defaultdict(float)
    for line in infile:
        line = line.rstrip()
        if line:
            rank = line.split(u' ')[0]
            if NUMBER_RE.match(rank) and line.count(u' ') == 2:
                _, freq, token = line.split(u' ')
                token = standardize_word(ftfy(token))
                freq = float(freq)
                counts[token] += freq
    return _scale_freqs(counts)

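# The Leeds list format that leeds_corpus_frequencies and read_leeds_corpus
# above expect is "<rank> <frequency> <token>" separated by single spaces; this
# made-up line shows how the three-way split consumes it.
line = u'42 1053.42 naïve'
_, freq, token = line.split(u' ')
# float(freq) == 1053.42 and token == u'naïve'
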
def open_csv_somehow_py2(filename):
    """
    Open a CSV file using Python 2's CSV module, working around the
    deficiency where it can't handle the null bytes of UTF-16.
    """
    encoding = detect_file_encoding(filename)
    if encoding.startswith('UTF-16'):
        csvfile = transcode_to_utf8(filename, encoding)
        encoding = 'UTF-8'
    else:
        csvfile = open(filename, 'rU')
    line = csvfile.readline()
    csvfile.seek(0)

    if '\t' in line:
        # tab-separated
        reader = csv.reader(csvfile, delimiter='\t')
    else:
        reader = csv.reader(csvfile, dialect='excel')

    header = reader.next()
    header = [ftfy(cell.decode(encoding).lower().strip()) for cell in header]
    return _read_csv(reader, header, encoding)

def fix_heading(heading):
    return ftfy(heading).strip('[]')

def fix_heading(heading): return ftfy(heading).strip("[]")
def c_cleverbot(client, message):
    yield from client.send_message(
        message.channel,
        ftfy.ftfy(sh.mention(message) + cw.say(sh.get_args(message, True))))

def fix(text):
    return ftfy(text).lower()

def encodeText(text):
    return ftfy.ftfy(text)

def format(s):
    return ftfy.ftfy(s) if type(s) == str else s


def format_obj(obj: object):