def choose_small_vocabulary(big_frame, concepts_filename, language,
                            vocab_size=50000):
    """
    Choose the vocabulary of the small frame from the index of `big_frame`.

    Terms are eliminated if they:

    - contain more than one word (that is, contain an underscore)
    - are not in ConceptNet (not listed in the file `concepts_filename`,
      which is read as one concept per line, with surrounding whitespace
      stripped)
    - are not frequent (not among the `vocab_size` highest-frequency
      terms that survive the first two filters)

    `language` is passed through to `word_frequency` when looking up each
    term's frequency.

    Returns a list of at most `vocab_size` terms, ordered from most to least
    frequent. Terms with equal frequency keep the order they had in
    `big_frame.index`.
    """
    # Use a context manager so the file handle is closed promptly, and read
    # it as UTF-8 instead of relying on the platform's default encoding.
    # NOTE(review): assumes the concepts file is UTF-8 -- confirm.
    with open(concepts_filename, encoding='utf-8') as concepts_file:
        concepts = {line.strip() for line in concepts_file}

    vocab = []
    for term in big_frame.index:
        if '_' in term or term not in concepts:
            continue
        label = uri_to_label(term)
        try:
            frequency = word_frequency(label, language, wordlist='large')
        except LookupError:
            # The 'large' lookup failed; retry with the 'combined' wordlist.
            frequency = word_frequency(label, language, wordlist='combined')
        vocab.append((term, frequency))

    # sorted() is stable, so ties stay in their original index order.
    ranked = sorted(vocab, key=lambda pair: pair[1], reverse=True)
    return [term for term, _ in ranked[:vocab_size]]
def get_vector(frame, label, language=None):
    """
    Returns the row of a vector-space DataFrame `frame` corresponding to the
    term `label`.

    If the first entry of the frame's index starts with '/', the frame is
    treated as being indexed by URIs: a `label` that does not start with '/'
    is converted with `standardized_uri(language, label)` before the lookup,
    so plain text can be passed in along with its `language`. Otherwise the
    frame is treated as being indexed by plain labels: a `label` that starts
    with '/' is converted with `uri_to_label`, and the result is passed
    through `replace_numbers` before the lookup.

    If the label is not in the frame (or the frame has no rows at all), this
    returns a float vector of all NaNs, indexed by the frame's columns.
    """
    def missing_vector():
        # An explicit dtype keeps this a float vector of NaNs; without it,
        # recent pandas versions build an object-dtype Series instead.
        return pd.Series(index=frame.columns, dtype=float)

    # An empty frame cannot contain any label, and has no first index entry
    # to inspect, so report it the same way as a missing label.
    if len(frame.index) == 0:
        return missing_vector()

    if frame.index[0].startswith('/'):
        # This frame has URIs in its index
        if label.startswith('/'):
            key = label
        else:
            key = standardized_uri(language, label)
    else:
        # This frame has plain labels in its index
        if label.startswith('/'):
            label = uri_to_label(label)
        key = replace_numbers(label)

    try:
        return frame.loc[key]
    except KeyError:
        # Return a vector of all NaNs
        return missing_vector()