def run_opencyc(input_file, output_file):
    """
    Read an .nq file containing OpenCyc data, outputting a file of
    ConceptNet edges and a file of mappings between the Semantic Web and
    ConceptNet.
    """
    out = MsgpackStreamWriter(output_file)

    labels = {}
    unlabels = defaultdict(set)
    seen_external_urls = set()

    # Read through the file once, finding all the "preferred labels". We will
    # use these as the surface texts for the nodes.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        if pred['url'] == RDF_LABEL:
            labels[subj['url']] = obj['text']
            unlabels[obj['text']].add(subj['url'])

    # Read through the file again and extract ConceptNet edges.
    for subj, pred, obj, _graph in parse_nquads(open(input_file, encoding='utf-8')):
        rel_name = resource_name(pred['url'])
        web_subj = subj.get('url')
        web_obj = obj.get('url')
        if (
            rel_name == 'subClassOf'
            and web_obj is not None
            and web_subj in labels
            and web_obj in labels
        ):
            subj_label = labels[web_subj]
            obj_label = labels[web_obj]
            if '_' in subj_label or '_' in obj_label:
                continue
            if subj_label.startswith('xsd:') or obj_label.startswith('xsd:'):
                continue
            subj_words = set(simple_tokenize(subj_label))
            obj_words = set(simple_tokenize(obj_label))
            if (subj_words & BLACKLIST_WORDS) or (obj_words & BLACKLIST_WORDS):
                continue
            if len(subj_words) > 4 or len(obj_words) > 4:
                continue

            subj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_subj)
            obj_uri = cyc_to_conceptnet_uri(labels, unlabels, web_obj)
            out.write(opencyc_edge('/r/IsA', subj_uri, obj_uri, subj_label, obj_label))
            if (subj_uri, web_subj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_subj))
                seen_external_urls.add((subj_uri, web_subj))
            if (obj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(obj_uri, web_obj))
                seen_external_urls.add((obj_uri, web_obj))
        elif (
            rel_name == 'sameAs'
            and web_subj in labels
            and web_obj.startswith('http://umbel.org/')
        ):
            subj_label = labels[web_subj]
            subj_uri = standardized_concept_uri('en', subj_label)
            if (subj_uri, web_obj) not in seen_external_urls:
                out.write(external_url_edge(subj_uri, web_obj))
                seen_external_urls.add((subj_uri, web_obj))

    out.close()
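# A minimal usage sketch for run_opencyc, assuming the input is an OpenCyc
# N-Quads dump. Both filenames below are hypothetical.
if __name__ == '__main__':
    run_opencyc('opencyc-latest.nq', 'opencyc.msgpack')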
def cyc_to_conceptnet_uri(labels, unlabels, uri):
    """
    Convert a Cyc URI to a ConceptNet URI, with the following rules:

    - Use the RDF label as the text. (Alternate labels appear to provide
      synonyms, but these are generally automatically generated and aren't
      particularly accurate.)
    - The part of speech is always 'n'. Cyc describes its concepts in a
      noun-like way. At worst, they're gerunds -- instead of "to eat", Cyc
      would define an event of "Eating".
    - If two different Cyc URIs have the same text, we will attempt to
      disambiguate them using the last component of the Cyc URI.
    - Remove the camel-casing from the Cyc URI component. If the phrase we
      get is the same as the natural-language label, disregard it as an
      uninformative disambiguation. Otherwise, that is the disambiguation
      text.

    A possible objection: Our disambiguation doesn't distinguish Cyc URIs
    that differ in capitalization, or differ by using underscores instead of
    camel-case. However, I've noticed that such URIs are usually
    *unintentional* duplicates that are okay to merge. If they were really
    unrelated concepts that needed to be distinguished, someone would have
    given them different names.

    Even so, we end up with some unnecessary word senses, such as different
    senses for "mens clothing", "men's clothing", and "men s clothing".
    """
    label = filter_stopwords(labels[uri])
    if len(unlabels[label]) >= 2:
        disambig = filter_stopwords(un_camel_case(resource_name(uri)))
        if simple_tokenize(disambig) != simple_tokenize(label):
            return standardized_concept_uri('en', label, 'n', 'opencyc', disambig)
    return standardized_concept_uri('en', label, 'n')
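# A toy illustration of the disambiguation rule in cyc_to_conceptnet_uri. The
# URIs and label are made up; in practice, `labels` and `unlabels` come from
# run_opencyc's first pass over the RDF labels.
toy_labels = {
    'http://sw.opencyc.org/concept/Mx0001-Bank': 'bank',
    'http://sw.opencyc.org/concept/Mx0002-RiverBank': 'bank',
}
toy_unlabels = {'bank': set(toy_labels)}
# Because two URIs share the label 'bank', each gets a disambiguation derived
# from un-camel-casing the last URI component, yielding a sense URI of the
# form '/c/en/bank/n/opencyc/...' rather than a bare '/c/en/bank'.
uri = cyc_to_conceptnet_uri(toy_labels, toy_unlabels,
                            'http://sw.opencyc.org/concept/Mx0002-RiverBank')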
def read_freqs(filename, cutoff=0, lang=None):
    """
    Read words and their frequencies from a CSV file.

    Only words with a frequency greater than or equal to `cutoff` are
    returned. If `cutoff` is greater than 0, the CSV file must be sorted by
    frequency in descending order.

    If `lang` is given, read_freqs will apply language-specific
    preprocessing operations.
    """
    raw_counts = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            if val < cutoff:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files
                # with duplicates, it does the right thing
                raw_counts[fix_text(token)] += val
                total += val

    for word in raw_counts:
        raw_counts[word] /= total

    return raw_counts
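# Hypothetical usage of read_freqs: the CSV file is assumed to hold
# "word,count" rows sorted by count in descending order, and the filename is
# made up.
freqs = read_freqs('en_wordlist.csv', cutoff=100, lang='en')
print(freqs.get('the', 0.0))  # each value is a fraction of the total mass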
def read_values(filename, cutoff=0, max_words=1e8, lang=None):
    """
    Read words and their frequency or count values from a CSV file. Returns
    a dictionary of values and the total of all values.

    Only words with a value greater than or equal to `cutoff` are returned.
    In addition, only up to `max_words` words are read.

    If `cutoff` is greater than 0 or `max_words` is smaller than the list,
    the CSV file must be sorted by value in descending order, so that the
    most frequent words are kept.

    If `lang` is given, it will apply language-specific tokenization to the
    words that it reads.
    """
    values = defaultdict(float)
    total = 0.
    with open(filename, encoding='utf-8', newline='') as infile:
        for key, strval in csv.reader(infile):
            val = float(strval)
            key = fix_text(key)
            if val < cutoff or len(values) >= max_words:
                break
            tokens = tokenize(key, lang) if lang is not None else simple_tokenize(key)
            for token in tokens:
                # Use += so that, if we give the reader concatenated files
                # with duplicates, it does the right thing
                values[token] += val
                total += val
    return values, total
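# Hypothetical usage of read_values: unlike read_freqs, it returns raw values
# plus their total, leaving normalization to the caller. The filename is made
# up.
values, total = read_values('en_wordlist.csv', cutoff=10, max_words=1e6, lang='en')
print(values.get('the', 0.0) / total if total else 0.0)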
def preprocess_and_tokenize_text(lang, text):
    """
    Get a string made from the tokens in the text, joined by underscores.

    >>> preprocess_and_tokenize_text('en', ' cat')
    'cat'
    >>> preprocess_and_tokenize_text('en', 'Italian supercat')
    'italian_supercat'
    >>> preprocess_and_tokenize_text('en', 'a big dog')
    'a_big_dog'
    >>> preprocess_and_tokenize_text('en', 'Test?!')
    'test'
    >>> preprocess_and_tokenize_text('en', 'TEST.')
    'test'
    >>> preprocess_and_tokenize_text('en', 'test/test')
    'test_test'
    >>> preprocess_and_tokenize_text('de', ' u\N{COMBINING DIAERESIS}ber\\n')
    'über'
    >>> preprocess_and_tokenize_text('en', 'embedded' + chr(9) + 'tab')
    'embedded_tab'
    >>> preprocess_and_tokenize_text('en', '_')
    ''
    >>> preprocess_and_tokenize_text('en', ',')
    ''
    """
    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    return '_'.join(tokens)
def filter_stopwords(text):
    """
    Remove the words in MORE_STOPWORDS from the text, unless doing so would
    leave nothing.
    """
    words = [word for word in simple_tokenize(text) if word not in MORE_STOPWORDS]
    text2 = ' '.join(words)
    if not text2:
        text2 = text
    return text2
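# A behavior sketch for filter_stopwords, under the assumption that
# MORE_STOPWORDS contains common function words such as 'of' and 'the':
#
#     filter_stopwords('device of the computer')  =>  'device computer'
#     filter_stopwords('the')                     =>  'the'
#
# The second case shows the fallback: if every token is a stopword, the
# original text is returned rather than an empty string.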
def text_to_vector(self, language, text):
    """
    Used in the Story Cloze Test to create a vector for a text: tokenize
    the text and combine the vectors of its tokens with equal weights.
    """
    tokens = wordfreq.simple_tokenize(text)
    weighted_terms = [
        (uri_prefix(standardized_uri(language, token)), 1.)
        for token in tokens
    ]
    return self.get_vector(weighted_terms, oov_vector=False)
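# Hypothetical usage, assuming `frame` is a loaded vector-space object that
# exposes this method and get_vector() (the sentence is arbitrary):
#
#     vec = frame.text_to_vector('en', 'John went to the store.')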
def standardized_concept_uri(lang, text, *more):
    """
    Make the appropriate URI for a concept in a particular language,
    including removing English stopwords, normalizing the text in a way
    appropriate to that language (using the text normalization from
    wordfreq), and joining its tokens with underscores in a concept URI.

    This text normalization can smooth over some writing differences: for
    example, it removes vowel points from Arabic words, and it
    transliterates Serbian written in the Cyrillic alphabet to the Latin
    alphabet so that it can match other words written in Latin letters.

    'more' contains information to distinguish word senses, such as a part
    of speech or a WordNet domain. The items in 'more' get lowercased and
    joined with underscores, but skip many of the other steps -- for
    example, they won't have stopwords removed.

    >>> standardized_concept_uri('en', 'this is a test')
    '/c/en/this_is_test'
    >>> standardized_concept_uri('en', 'this is a test', 'n', 'example phrase')
    '/c/en/this_is_test/n/example_phrase'
    >>> standardized_concept_uri('sh', 'симетрија')
    '/c/sh/simetrija'
    """
    lang = lang.lower()
    if lang in LCODE_ALIASES:
        lang = LCODE_ALIASES[lang]
    if lang == 'en':
        token_filter = english_filter
    else:
        token_filter = None
    text = preprocess_text(text.replace('_', ' '), lang)
    tokens = simple_tokenize(text)
    if token_filter is not None:
        tokens = token_filter(tokens)
    norm_text = '_'.join(tokens)
    more_text = []
    for item in more:
        if item is not None:
            tokens = simple_tokenize(item.replace('_', ' '))
            if token_filter is not None:
                tokens = token_filter(tokens)
            more_text.append('_'.join(tokens))
    return concept_uri(lang, norm_text, *more_text)
def standardize_username(username):
    """
    Convert usernames into a canonical form that can be used in URIs.

    If the username is an e-mail address, just keep the part before the
    @ sign.
    """
    name = username.strip('@').split('@')[0]
    name = '_'.join(simple_tokenize(name.replace('_', ' ')))
    return name
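# Expected behavior of standardize_username, following the code above (note
# that simple_tokenize lowercases its input, as the doctests for
# standardize_text below confirm):
#
#     standardize_username('@Example_User')        =>  'example_user'
#     standardize_username('someone@example.com')  =>  'someone'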
def test_simple_tokenize():
    # When Japanese is run through simple_tokenize -- either because it's
    # tagged with the wrong language, or because we want to pass through
    # Japanese text without getting MeCab involved -- it will be split at
    # boundaries between Japanese and non-Japanese scripts, but all Japanese
    # scripts will be stuck together. Here the switch between hiragana
    # (ひらがな) and katakana (カタカナ) is not a boundary, but the switch
    # between katakana and romaji is.
    #
    # We used to try to infer word boundaries between hiragana and katakana,
    # but this leads to edge cases that are unsolvable without a dictionary.
    ja_text = 'ひらがなカタカナromaji'
    assert simple_tokenize(ja_text) == ['ひらがなカタカナ', 'romaji']

    # An example that would be multiple tokens if tokenized as 'ja' via MeCab,
    # but sticks together in simple_tokenize
    assert simple_tokenize('おはようございます') == ['おはようございます']

    # Names that use the weird possessive marker ヶ, which is technically a
    # katakana even though it's being used like a kanji, stay together as one
    # token
    assert simple_tokenize("犬ヶ島") == ["犬ヶ島"]

    # The word in ConceptNet that made me notice that simple_tokenize used
    # to have a problem with the character 々
    assert simple_tokenize("晴々しい") == ["晴々しい"]

    # Explicit word separators are still token boundaries, such as the dot
    # between "toner" and "cartridge" in "toner cartridge"
    assert simple_tokenize("トナー・カートリッジ") == ["トナー", "カートリッジ"]

    # This word has multiple weird characters that aren't quite kanji in it,
    # and is in the dictionary
    assert simple_tokenize("見ヶ〆料") == ["見ヶ〆料"]
def standardize_text(text, token_filter=None):
    """
    Get a string made from the tokens in the text, joined by underscores.
    The tokens may have a language-specific `token_filter` applied to them.
    See `standardize_as_list()`.

    >>> standardize_text(' cat')
    'cat'
    >>> standardize_text('Italian supercat')
    'italian_supercat'
    >>> standardize_text('a big dog')
    'a_big_dog'
    >>> standardize_text('a big dog', token_filter=english_filter)
    'big_dog'
    >>> standardize_text('to go', token_filter=english_filter)
    'go'
    >>> standardize_text('Test?!')
    'test'
    >>> standardize_text('TEST.')
    'test'
    >>> standardize_text('test/test')
    'test_test'
    >>> standardize_text(' u\N{COMBINING DIAERESIS}ber\\n')
    'über'
    >>> standardize_text('embedded' + chr(9) + 'tab')
    'embedded_tab'
    >>> standardize_text('_')
    ''
    >>> standardize_text(',')
    ''
    """
    tokens = simple_tokenize(text.replace('_', ' '))
    if token_filter is not None:
        tokens = token_filter(tokens)
    return '_'.join(tokens)
def count_tokens(filename):
    """
    Count tokens that appear in a file, running each line through our
    simple tokenizer.

    URLs will be skipped, and Unicode errors will become separate tokens
    containing '�'.
    """
    counts = defaultdict(int)
    with open(filename, encoding='utf-8', errors='replace') as infile:
        for line in infile:
            line = URL_RE.sub('', line.strip())
            for token in simple_tokenize(line):
                counts[token] += 1
    return counts
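# Hypothetical usage of count_tokens: report the ten most frequent tokens in
# a text file (the filename is made up).
counts = count_tokens('corpus.txt')
for token in sorted(counts, key=counts.get, reverse=True)[:10]:
    print(token, counts[token])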
def main(in_file, out_file, err):
    print("TOKENIZING", file=err)
    processed = 0
    last_print = 0
    step = 100000
    for line in in_file:
        if processed - last_print > step:
            last_print += step
            print("\033[1K\rProcessed %s tokens" % processed, file=err, end='')
        tokens = simple_tokenize(line)
        print(" ".join(tokens), file=out_file)
        processed += len(tokens)
    print("\033[1K\rProcessed %s tokens" % processed, file=err)
    print(file=err)
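# This main() is a stream filter: it reads raw text lines, writes one line of
# space-separated tokens per input line, and reports progress on stderr. A
# plausible way to wire it up as a command-line tool:
import sys

if __name__ == '__main__':
    main(sys.stdin, sys.stdout, sys.stderr)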
def valid_concept_name(text):
    """
    Returns whether this text can be reasonably represented in a concept
    URI. This helps to protect against making useless concepts out of
    empty strings or punctuation.

    >>> valid_concept_name('word')
    True
    >>> valid_concept_name('the')
    True
    >>> valid_concept_name(',,')
    False
    >>> valid_concept_name(',')
    False
    >>> valid_concept_name('/')
    False
    >>> valid_concept_name(' ')
    False
    """
    tokens = simple_tokenize(text.replace('_', ' '))
    return len(tokens) > 0
def build_sources(parts_dict, preposition_fix=False):
    """
    Create the 'source' information for an assertion. The output is a list
    of source dictionaries, each with a 'weight' field and fields such as
    'contributor' and 'activity' describing where the assertion came from.
    Later, inside the 'make_edge' function, these will be combined into an
    '/and' node.
    """
    creator_source = {}
    creator_node = join_uri(
        '/s/contributor/omcs', standardize_username(parts_dict["creator"])
    )
    creator_source['contributor'] = creator_node

    activity = parts_dict["activity"]
    activity = '_'.join(simple_tokenize(activity.replace('_', ' ')))
    activity_node = join_uri('/s/activity/omcs', activity)
    creator_source['activity'] = activity_node

    if preposition_fix:
        creator_source['process'] = '/s/process/preposition_fix'

    creator_source['weight'] = 1.
    sources = [creator_source]

    for vote in parts_dict["votes"]:
        username = vote[0]
        if username == parts_dict["creator"]:
            continue

        vote_int = vote[1]
        vote_source = {
            'contributor': join_uri(
                '/s/contributor/omcs', standardize_username(username)
            ),
            'activity': '/s/activity/omcs/vote',
            'weight': float(vote_int),
        }
        sources.append(vote_source)
    return sources
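# A sketch of the input build_sources expects, inferred from the fields the
# function reads; the usernames, activity, and votes are made up.
example_parts = {
    'creator': 'alice',
    'activity': 'omcs web form',
    'votes': [('alice', 1), ('bob', 1), ('carol', -1)],
}
# Yields one source dict for the creator (weight 1.0), plus one per
# non-creator vote: bob at weight 1.0 and carol at weight -1.0.
example_sources = build_sources(example_parts)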
# The imports below are assumptions about this snippet's context: it appears
# to use the python-twitter client, NLTK's stopword list, matplotlib, and
# wordfreq.
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from wordfreq import simple_tokenize

# `api` is assumed to be an authenticated twitter.Api instance created
# elsewhere, configured so that tweets carry a `full_text` attribute.
new_tweets = api.GetUserTimeline(screen_name="realDonaldTrump", count=200)
all_tweets = []
all_tweets.extend(new_tweets)

while len(new_tweets) != 0:
    oldest_id = all_tweets[-1].id - 1
    new_tweets = api.GetUserTimeline(
        screen_name="realDonaldTrump", count=200, max_id=oldest_id
    )
    all_tweets.extend(new_tweets)
    print("{} tweets retrieved so far...".format(len(all_tweets)))

trump_tweets = [tweet.full_text for tweet in all_tweets]

# Now, let's take a look at Trump's 10 most recent tweets just for kicks
print(trump_tweets[:10])

# Let's take a look at a few of Trump's most commonly-used words
tokenized_tweets = [simple_tokenize(tweet) for tweet in trump_tweets]

# Build the stopword set once instead of re-reading the list for every token
english_stopwords = set(stopwords.words("english"))
counts = {}
for tokenized in tokenized_tweets:
    for word in tokenized:
        if word not in english_stopwords:
            if word != "https" and word != "t.co" and word != "rt":
                if word not in counts:
                    counts[word] = 1
                else:
                    counts[word] += 1

sorted_counts = sorted(counts, key=counts.get, reverse=True)
top_20_keys = sorted_counts[:20]
top_20_values = [counts[key] for key in top_20_keys]

plt.style.use("ggplot")
plt.bar(top_20_keys, top_20_values, color="blue")
plt.ylabel("Frequencies")