import json
import pickle

from ftfy import fix_text_segment
# Assuming dense article vectors: numpy's hstack stacks the two
# (n_articles, n_features) blocks column-wise in build_kc_vectors.
from numpy import hstack
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer

from eval.util import progress

# NOTE: `ConceptTokenizer`, `keywords`, `concepts`, and `pipe` are
# project-internal; import them from wherever they live in this repo.


def clean(datapath):
    """
    Fixes encoding errors in a data file and gets rid of
    articles which still seem problematic.
    """
    # Byte sequences typical of mojibake; articles still containing
    # them after repair are considered unsalvageable.
    red_flags = ['â€', 'Â']

    with open(datapath, 'r') as file:
        data = json.load(file)

    bad = []
    good = []
    for article in progress(data, 'Fixing {0} articles...'.format(len(data))):
        for key in ['title', 'text']:
            article[key] = fix_text_segment(article[key])

        flagged = False
        for flag in red_flags:
            if flag in article['text'] + article['title']:
                bad.append(article)
                flagged = True
                break
        if not flagged:
            good.append(article)

    print('Getting rid of {0} bad articles.'.format(len(bad)))

    outpath = datapath.replace('.json', '_cleaned.json')
    with open(outpath, 'w') as file:
        json.dump(good, file)
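
# A minimal usage sketch for `clean`; the path and sample data below are
# hypothetical. It expects a JSON list of article dicts with 'title' and
# 'text' keys, and writes the cleaned subset next to the input file.
def _demo_clean():
    sample = [
        {'title': 'Fine article', 'text': 'Readable text.'},
        {'title': 'Broken â€œtitle', 'text': 'Mojibake-ridden text.'},
    ]
    with open('/tmp/articles.json', 'w') as f:
        json.dump(sample, f)
    clean('/tmp/articles.json')
    # => writes '/tmp/articles_cleaned.json', keeping only the articles
    #    that pass the red-flag check after ftfy repairs them.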
def build_kc_vectors(articles, savepath=None):
    """
    Stacks each article's bag-of-words vectors and concept vectors
    into a single feature matrix, optionally pickling the result.
    """
    bow_vecs, concept_vecs = [], []
    for a in progress(articles, 'Building article vectors...'):
        bow_vecs.append(a.vectors)
        concept_vecs.append(a.concept_vectors)

    print('Merging vectors...')
    vecs = hstack([bow_vecs, concept_vecs])
    print('Using {0} features.'.format(vecs.shape[1]))

    if savepath:
        with open(savepath, 'wb') as f:
            pickle.dump(vecs, f)

    return vecs
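
# A minimal sketch of calling `build_kc_vectors`; the article type and
# vector sizes here are hypothetical stand-ins. It assumes each article
# exposes dense `vectors` and `concept_vectors` arrays of consistent size,
# so the two lists stack into (n_articles, n_bow + n_concept) columns.
def _demo_build_kc_vectors():
    import numpy as np
    from collections import namedtuple

    Article = namedtuple('Article', ['vectors', 'concept_vectors'])
    articles = [Article(np.random.rand(100), np.random.rand(50))
                for _ in range(10)]

    vecs = build_kc_vectors(articles, savepath='/tmp/vecs.pkl')
    assert vecs.shape == (10, 150)

    # The pickled matrix can be reloaded later:
    with open('/tmp/vecs.pkl', 'rb') as f:
        assert pickle.load(f).shape == (10, 150)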
def train(docs, n_components=200, pipetype='stanford'):
    """
    Trains and serializes (pickles) a vectorizing pipeline
    based on training data.

    `min_df` is set to filter out extremely rare words,
    since we don't want those to dominate the distance metric.

    `max_df` is set to filter out extremely common words,
    since they don't convey much information.
    """
    pipeline = Pipeline([
        ('vectorizer', CountVectorizer(input='content', stop_words='english',
                                       lowercase=True, tokenizer=ConceptTokenizer(),
                                       min_df=0.01, max_df=0.9)),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('feature_reducer', TruncatedSVD(n_components=n_components)),
        ('normalizer', Normalizer(copy=False))
    ])

    print('Training on {0} docs...'.format(len(docs)))

    cons = []

    # Hint: n_components=150 is a good value here.
    if pipetype == 'keyword':
        for doc in progress(docs, 'Extracting concepts...'):
            cons.append('||'.join(keywords(doc)))

    # Hint: n_components=200 is a good value here.
    elif pipetype == 'stanford':
        for doc in progress(docs, 'Extracting concepts...'):
            cons.append('||'.join(concepts(doc, strategy='stanford')))

    # Hint: n_components=200 is a good value here.
    elif pipetype == 'spotlight':
        from http.client import BadStatusLine
        from time import sleep

        problems = 0
        max_retries = 5
        for doc in progress(docs, 'Extracting concepts...'):
            # The Spotlight endpoint occasionally drops connections; retry
            # with a growing backoff, re-raising once retries are exhausted.
            retries = 0
            while True:
                try:
                    cons.append('||'.join(concepts(doc, strategy='spotlight')))
                    break
                except BadStatusLine:
                    problems += 1
                    retries += 1
                    if retries >= max_retries:
                        raise
                    sleep(1 * retries)
        print('Had {0} problems.'.format(problems))

    else:
        raise ValueError('Unrecognized pipetype: {0}.'.format(pipetype))

    # temp: dump the extracted concepts for inspection.
    with open('/Users/ftseng/{0}.json'.format(pipetype), 'w') as f:
        json.dump(cons, f)

    pipeline.fit(cons)
    pipe.save_pipeline(pipeline, pipetype)
    print('Training complete.')
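
# A minimal sketch of calling `train`; the corpus below is a hypothetical
# stand-in. Note that TruncatedSVD requires n_components to be smaller than
# the vocabulary size, so a real corpus must be far larger than this.
def _demo_train():
    docs = [
        'The Large Hadron Collider is a particle accelerator at CERN.',
        'CERN physicists reported new collider results today.',
        # ... many more documents ...
    ]
    # Per the hints above: 150 components for 'keyword',
    # 200 for 'stanford' or 'spotlight'.
    train(docs, n_components=200, pipetype='stanford')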