def generate_feature_matrix(data, stemmer, **prune_params):
    """
    Build a tf-idf feature matrix and label vector from a mapping of saved
    HTML pages to labels, using Goose to strip boilerplate from each page.
    """
    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    _parser = HTMLParser()
    sr_index = HashedIndex()

    for url_path, label in data.items():
        if os.path.exists(url_path):
            with open(url_path, 'r') as html_file:
                html_text = html_file.read()

            # Extract the main article text and unescape HTML entities
            text = unicode(goose.extract(raw_html=html_text).cleaned_text)
            text = _parser.unescape(text)

            for token in word_tokenize(text, stemmer=stemmer):
                sr_index.add_term_occurrence(token, url_path)

    sr_index.prune(**prune_params)

    X = sr_index.generate_feature_matrix(mode='tfidf')

    # Label is 1 for pages with a non-None label, 0 otherwise
    y = np.zeros(len(sr_index.documents()))
    for index, doc in enumerate(sr_index.documents()):
        y[index] = 0 if data[doc] is None else 1

    return X, y
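
# --- Illustrative usage sketch (not part of the original source) ---
# Shows one way generate_feature_matrix might be driven end to end. The data
# path, the load_data_source arguments, and the prune parameter below are
# assumptions for illustration only.
def _example_generate_feature_matrix():
    import textparser
    from datasource import load_data_source

    data = load_data_source('/path/to/data',
                            subreddit='python', page_samples=200)  # hypothetical arguments
    X, y = generate_feature_matrix(data, textparser.NullStemmer(),
                                   min_frequency=2)  # hypothetical prune parameter
    print X.shape, y.shape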
def cmd_readstream(args, t, active_events):
    import textwrap
    from goose import Goose, Configuration

    config = Configuration()
    config.enable_image_fetching = False
    g = Goose(config)

    # Read raw stream-item chunks unless the "articles" argument was given.
    raw_stream = True
    for arg in args:
        if arg == "articles":
            raw_stream = False

    for event in active_events:
        print event
        if event.query_id.startswith("TS13"):
            corpus = cuttsum.corpora.EnglishAndUnknown2013()
        elif event.query_id.startswith("TS14"):
            corpus = cuttsum.corpora.SerifOnly2014()
        else:
            raise Exception("Bad query id: {}".format(event.query_id))

        if raw_stream is True:
            from cuttsum.trecdata import SCChunkResource
            si_iter = SCChunkResource().streamitem_iter(event, corpus)
        else:
            from cuttsum.pipeline import ArticlesResource
            si_iter = ArticlesResource().streamitem_iter(event, corpus)

        for hour, path, si in si_iter:
            if si.body.clean_visible is not None:
                print si.stream_id
                try:
                    text_height = t.height - 4
                    article = g.extract(raw_html=si.body.clean_html)
                    lines = textwrap.wrap(article.cleaned_text)
                    idx = 0
                    # Simple pager: "i" scrolls up, "k" scrolls down,
                    # "l" skips to the next stream item.
                    while 1:
                        print t.clear
                        print "hour:", hour
                        print "title:", article.title
                        print "article:"
                        print "\n".join(lines[idx:idx + text_height])
                        with t.cbreak():
                            char = t.inkey()
                            if char == "i" and idx > 0:
                                idx -= 1
                            elif char == "k" and idx + text_height < len(lines):
                                idx += 1
                            elif char == "l":
                                break
                except Exception, e:
                    print e
                    continue
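
# --- Illustrative usage sketch (not part of the original source) ---
# cmd_readstream appears to expect a blessed-style Terminal as `t` (it uses
# t.height, t.clear, t.cbreak() and t.inkey()). The Terminal import and the
# event lookup below are assumptions for illustration only.
def _example_readstream():
    from blessed import Terminal
    import cuttsum.events

    term = Terminal()
    events = cuttsum.events.get_events(by_query_ids=["TS13.1"])
    cmd_readstream(["articles"], term, events)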
def generate_feature_matrix(wiki, data, n_concepts=10, **word_concept_params):
    """
    Transforms a given data source to a corresponding feature matrix and label
    vector based on the "Bag of Concepts" model, which uses Wikipedia as an
    exogenous knowledge source for Word Sense Disambiguation and as additional
    domain knowledge.

    Contains logging code which is displayed depending on the currently set
    logging level of the root logger.

    :param wiki: WikiIndex instance to some database index.
    :param data: data labels loaded using a load_data_source method.
    :param n_concepts: number of concepts to use per page.
    :param word_concept_params: word concept parameters to use for generation
        of concepts.
    :return: Numpy feature matrix and label vector.
    """
    config = Configuration()
    config.enable_image_fetching = False
    config.use_meta_language = False
    goose = Goose(config)

    results = {}
    concepts = set()

    # Iterate through the data and perform training
    for index, (abs_path, label) in enumerate(data.items()):
        if not os.path.exists(abs_path):
            continue

        with open(abs_path, 'r') as fp:
            html_text = fp.read()

        # Determine the relative path using a simple heuristic
        cutoff = abs_path.find('pages/')
        rel_path = abs_path[cutoff + 6:]

        logging.info('\n%d: http://%s', index, rel_path[:-3])

        article = goose.extract(raw_html=html_text)
        if len(article.cleaned_text) > 500:
            logging.info('%s (%s)', article.title, label)

            search_results, terms, query_vector = wiki.word_concepts(
                article.cleaned_text, article.title, **word_concept_params)

            if search_results:
                results[abs_path] = [(sr.page_id, sr.weight)
                                     for sr in search_results[:n_concepts]]

                # Remove any concepts which have a weight of 0
                results[abs_path] = filter(lambda x: x[1] > 0, results[abs_path])

                for search_result in search_results[:n_concepts]:
                    concepts.add(search_result.page_id)

                logging.info(search_results[:n_concepts])
            else:
                logging.warn('No word concepts returned')
        else:
            logging.info('Document is of insufficient length')

    # Map each concept page id to a column index in the feature matrix
    shape = (len(results), len(concepts))
    concepts_index = dict([(b, a) for (a, b) in enumerate(concepts)])

    feature_matrix = np.zeros(shape=shape)
    label_vector = np.zeros(len(results))

    for i, (abs_path, page_list) in enumerate(results.iteritems()):
        label_vector[i] = 1 if data[abs_path] is not None else 0
        for page_id, weight in page_list:
            j = concepts_index[page_id]
            feature_matrix[i, j] = weight

    return feature_matrix, label_vector
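
# --- Illustrative usage sketch (not part of the original source) ---
# Shows how the Bag-of-Concepts features might feed a scikit-learn classifier.
# WikiIndex is assumed to be available from this repo's index package; its
# constructor arguments, the data path, and the word-concept parameter below
# are assumptions for illustration only.
def _example_bag_of_concepts():
    from sklearn.linear_model import LogisticRegression
    from datasource import load_data_source

    wiki = WikiIndex('localhost')                 # hypothetical constructor arguments
    data = load_data_source('/path/to/data')      # hypothetical arguments
    X, y = generate_feature_matrix(wiki, data, n_concepts=10,
                                   min_tfidf=0.5)  # hypothetical word-concept parameter
    classifier = LogisticRegression().fit(X, y)
    print classifier.score(X, y)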
def __init__(self, need_stem):
    # Set up Goose with image fetching disabled
    config = Configuration()
    config.enable_image_fetching = False
    self._g = Goose(config)
    self._need_stem = need_stem
from goose import Configuration, Goose
from HTMLParser import HTMLParser

import textparser

from datasource import load_data_source
from index.hashedindex import HashedIndex, load_meta
from utils import search_files

if __name__ == '__main__':
    import time
    t0 = time.time()

    _parser = HTMLParser()

    _config = Configuration()
    _config.enable_image_fetching = False
    _config.use_meta_language = False
    _goose = Goose(_config)

    # Lancaster Stemmer is very, very slow
    _stemmer = textparser.NullStemmer()

    data_path = '/home/michaela/Development/Reddit-Testing-Data'

    # Set the parameters to the program over here
    force_reindex = False
    parameters = {
        'samples': 800,
        'subreddit': 'python',
import cuttsum.judgements
import cuttsum.events
import cuttsum.corpora
from cuttsum.misc import stringify_streamcorpus_sentence

import streamcorpus as sc

event = cuttsum.events.get_events(by_query_ids=["TS13.1"])[0]
corpus = cuttsum.corpora.EnglishAndUnknown2013()
example_path = "/scratch/t-chkedz/trec-data/articles/gold/2012_buenos_aires_rail_disaster/2012-02-23-01.sc.gz"
example_id = "1329959700-18d497cf08e3500f195066be60e6a201"

matches = cuttsum.judgements.get_merged_dataframe()

from goose import Goose, Configuration

config = Configuration()
config.enable_image_fetching = False
g = Goose(config)


def si_to_df(si):
    sents = []
    for s, sent in enumerate(si.body.sentences["lingpipe"]):
        # Truncated in the original source; presumably each sentence is
        # stringified before being collected.
        sents.append(stringify_streamcorpus_sentence(sent))
    return sents


with sc.Chunk(path=example_path, mode="rb", message=corpus.sc_msg()) as chunk:
    for si in chunk:
        if si.stream_id == example_id:
            for s, sent in enumerate(si.body.sentences["lingpipe"]):
                # Truncated in the original source; the loop body is not shown.
                pass
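
# --- Illustrative sketch (not part of the original source) ---
# The merged judgements frame appears to carry "query id" and "update id"
# columns, with update ids of the form "<stream id>-<sentence id>" (as used
# elsewhere in this repo), so the judgements touching the example stream item
# could be pulled out like this:
example_matches = matches[
    (matches["query id"] == event.query_id) &
    (matches["update id"].str.startswith(example_id))]
print example_matches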
def do_job_unit(self, event, corpus, unit, **kwargs):
    extractor = kwargs.get("extractor", "gold")
    data_dir = os.path.join(self.dir_, extractor, event.fs_name())
    chunks_resource = SCChunkResource()

    if not os.path.exists(data_dir):
        try:
            os.makedirs(data_dir)
        except OSError as exc:
            if exc.errno == errno.EEXIST and os.path.isdir(data_dir):
                pass

    if extractor == "gold":
        import cuttsum.judgements

        # Collect the stream ids of all judged updates for this event and
        # group them by the hour they were published in.
        all_matches = cuttsum.judgements.get_matches()
        matches = all_matches[all_matches["query id"] == event.query_id]
        stream_ids = set(
            matches["update id"].apply(
                lambda x: "-".join(x.split("-")[:-1])).tolist())
        hours = set([datetime.utcfromtimestamp(
                         int(update_id.split("-")[0])).replace(
                         minute=0, second=0)
                     for update_id in matches["update id"].tolist()])
        hours = sorted(list(hours))
        hour = hours[unit]

        output_path = self.get_chunk_path(event, extractor, hour, corpus)
        gold_si = []
        for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
            with sc.Chunk(path=path, mode="rb",
                          message=corpus.sc_msg()) as chunk:
                for si in chunk:
                    if si.stream_id in stream_ids:
                        gold_si.append(si)

        gold_si.sort(key=lambda x: x.stream_id)
        for si in gold_si:
            print si.stream_id

        if os.path.exists(output_path):
            os.remove(output_path)

        with sc.Chunk(path=output_path, mode="wb",
                      message=corpus.sc_msg()) as chunk:
            for si in gold_si:
                chunk.add(si)

    elif extractor == "goose":
        import nltk
        from nltk.tokenize import WordPunctTokenizer
        sent_tok = nltk.data.load('tokenizers/punkt/english.pickle')
        word_tok = WordPunctTokenizer()

        from goose import Goose, Configuration
        config = Configuration()
        config.enable_image_fetching = False
        g = Goose(config)

        hour = event.list_event_hours()[unit]
        output_path_tmp = self.get_chunk_template(event, extractor, hour, corpus)

        good_si = []
        for path in chunks_resource.get_chunks_for_hour(hour, corpus, event):
            try:
                with sc.Chunk(path=path, mode="rb",
                              message=corpus.sc_msg()) as chunk:
                    for si in chunk:
                        if si.body.clean_visible is None:
                            continue

                        article_text = self._get_goose_text(g, si)
                        if article_text is None:
                            continue

                        if not self._contains_query(event, article_text):
                            continue

                        # Align Goose article sentences to the original
                        # stream item sentences.
                        art_pretty = sent_tok.tokenize(article_text)
                        art_sents = [word_tok.tokenize(sent)
                                     for sent in art_pretty]

                        df = si2df(si)
                        I = self._map_goose2streamitem(
                            art_sents, df["words"].tolist())

                        if "serif" in si.body.sentences:
                            si_sentences = si.body.sentences["serif"]
                        elif "lingpipe" in si.body.sentences:
                            si_sentences = si.body.sentences["lingpipe"]
                        else:
                            raise Exception("Bad sentence annotator.")

                        ann = sc.Annotator()
                        ann.annotator_id = "goose"

                        si.body.sentences["goose"] = [sc.Sentence()
                                                      for _ in si_sentences]
                        for i_goose, i_si in enumerate(I):
                            tokens = [sc.Token(token=token.encode("utf-8"))
                                      for token in art_sents[i_goose]]
                            si.body.sentences["goose"][i_si].tokens.extend(
                                tokens)
                        good_si.append(si)
            except TypeError:
                continue

        output_path = output_path_tmp.format(len(good_si))
        odir = os.path.dirname(output_path)
        if not os.path.exists(odir):
            os.makedirs(odir)

        good_si.sort(key=lambda x: x.stream_id)
        for si in good_si:
            print si.stream_id

        if os.path.exists(output_path):
            os.remove(output_path)

        print "Writing to", output_path
        with sc.Chunk(path=output_path, mode="wb",
                      message=corpus.sc_msg()) as chunk:
            for si in good_si:
                chunk.add(si)

    else:
        raise Exception("extractor: {} not implemented!".format(extractor))
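
# --- Illustrative sketch (not part of the original source) ---
# do_job_unit processes one hour per unit (for the "goose" extractor, units
# index event.list_event_hours()), so a resource is typically driven by looping
# over those hours. Which resource class defines do_job_unit is an assumption
# here; ArticlesResource is used only as a plausible example.
def _example_run_all_units(event, corpus):
    resource = ArticlesResource()  # hypothetical: whichever resource defines do_job_unit
    n_units = len(event.list_event_hours())
    for unit in xrange(n_units):
        resource.do_job_unit(event, corpus, unit, extractor="goose")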