def main(args):
    argp = _argparser().parse_args(args[1:])
    # TODO: Update the default to the best one we got after experiments
    # TODO: Adding a default has unforeseen consequences
    assert argp.features

    for line in (l.rstrip('\n') for l in stdin):
        _, lbl, pre, _, post = line.split('\t')

        # Tokenise the context
        # XXX: Discards meaningful spaces
        pre_toks = tokenize(pre.strip()).split()
        post_toks = tokenize(post.strip()).split()

        # Keep a three-token window on either side of the focus position
        toks = pre_toks[-3:] + [FOCUS_DUMMY] + post_toks[:3]
        graph, nodes = prev_next_graph(toks)

        # Locate the focus node in the token graph
        for node in nodes:
            if node.value == FOCUS_DUMMY:
                focus = node
                break
        else:
            assert False, 'focus dummy token not found'

        # Generate the requested feature sets for the focus node
        f_vec = {}
        for f_set in argp.features:
            for f_name, f_val in F_FUNC_BY_F_SET[f_set](nodes, graph, focus):
                f_vec[f_name] = f_val

        if not f_vec:
            print >> stderr, 'WARNING: No features generated!'
            continue

        # Emit "label<TAB>name:value name:value ..." with features sorted by name
        stdout.write(lbl)
        stdout.write('\t')
        stdout.write(' '.join('{0}:{1}'.format(f_name, f_vec[f_name])
                              for f_name in sorted(f_vec)))
        stdout.write('\n')

    return 0
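# A hypothetical driver sketch (not part of the original code): main() expects
# five tab-separated fields per stdin line, of which only the 2nd (label),
# 3rd (pre-context) and 5th (post-context) are read, and it writes
# "label<TAB>feature:value ..." lines to stdout. Assuming the module is saved
# as featurise.py and exposes this main(), it could be driven as:
#
#   from sys import argv, exit
#   import featurise
#   # stdin line: "id\tPOS\tthe quick brown\t_\tfox jumps over"
#   exit(featurise.main(argv))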
def gen_word_list(term_dict):
    word_list = {}
    max_ip_count = 0

    # Accumulate per-word statistics over the tokenised term labels
    for k in term_dict:
        term_ptrs = gtbtokenize.tokenize(term_dict[k]["label"]).split()
        for word in term_ptrs:
            if word not in word_list:
                word_list[word] = {"terms": set(), "mags": [], "magnitude": 0,
                                   "color": 0, "unique_ips": 0,
                                   "opacity": min_alpha, "label": word}
                if word.lower() in idfs:
                    word_list[word]["idf"] = idfs[word.lower()]["IDF"]
                else:
                    word_list[word]["idf"] = UNMAPPED_IDF_CONST
            word_list[word]["terms"].add(term_dict[k]["term_id"])
            word_list[word]["mags"].append(term_dict[k]["magnitude"])
            word_list[word]["unique_ips"] += term_dict[k]["unique_ips"]
            if word_list[word]["unique_ips"] > max_ip_count:
                max_ip_count = word_list[word]["unique_ips"]
    #print len(word_list)

    # Derive magnitude (IDF-weighted median), colour (number of distinct terms)
    # and opacity (unique-IP count relative to the maximum) for each word
    for k in word_list:
        word_list[k]["magnitude"] = np.log2(np.median(word_list[k]["mags"]) * word_list[k]["idf"])
        word_list[k]["color"] = len(word_list[k]["terms"])
        word_list[k]["opacity"] = min_alpha + (1 - min_alpha) * np.log2(word_list[k]["unique_ips"]) / np.log2(max_ip_count)
        word_list[k]["terms"] = list(word_list[k]["terms"])

    # Rank words by magnitude; the top-WC_THRES slice is computed here but the
    # full word list is what gets returned
    word_df = pd.DataFrame.from_dict(word_list, orient="index")
    word_df = word_df.sort_values("magnitude", ascending=False)
    sel_word_list = word_df[0:WC_THRES]
    return word_list
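# Illustrative call for gen_word_list() above; `idfs`, `min_alpha` and
# `WC_THRES` are module-level globals assumed to be initialised elsewhere,
# and the input values here are made up:
#
#   idfs = {"cancer": {"IDF": 6.2}}          # lower-cased word -> IDF record
#   min_alpha, WC_THRES = 0.3, 250           # hypothetical settings
#   terms = {
#       "t1": {"term_id": "T1", "label": "breast cancer",
#              "magnitude": 12.0, "unique_ips": 40},
#       "t2": {"term_id": "T2", "label": "lung cancer",
#              "magnitude": 8.0, "unique_ips": 25},
#   }
#   words = gen_word_list(terms)             # per-word stats, e.g. for a word cloud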
def gtb_token_boundary_gen(text):
    from gtbtokenize import tokenize
    tokens = tokenize(text).split()
    for o in _token_boundaries_by_alignment(tokens, text):
        yield o
def tokenize_multiline(text):
    return '\n'.join(tokenize(s) for s in text.split('\n'))
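# Example use of the two helpers above (illustrative only; the exact tokens
# depend on the GTB tokeniser, and _token_boundaries_by_alignment is assumed
# to yield (start, end) character offsets into the input text):
#
#   text = "IL-2 activates T cells.\nSee Fig. 1."
#   print tokenize_multiline(text)                 # one tokenised line per input line
#   spans = list(gtb_token_boundary_gen("IL-2 binds."))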
import json, re, sys, os, collections, csv, math, time
import numpy as np
import gtbtokenize
import networkx as nx
import sklearn.metrics as metrics
from operator import itemgetter
import pandas as pd
from utils import MatrixIO, FileUtils

first_cap_re = re.compile('(.)([A-Z][a-z]+)')
all_cap_re = re.compile('([a-z0-9])([A-Z])')
f = lambda x: re.sub(r'[^a-z0-9]', "", x)
tokenize = lambda x: gtbtokenize.tokenize(x).lower()

EMBEDDING_SIZE = 100
N = 24358723
UNMAPPED_IDF_CONST = 4
MIN_ED = 4

vector_file = "../lod_query/biomed_vectors_p.txt"
vocab_file = "../lod_query/biomed_vocab_p.txt"
idf_file = "../lod_query/idf_file.tsv"

vocab_dict = {}
enc_vocab_dict = {}

stopWords = set([
    "a", "also", "although", "am", "an", "and", "are", ".", "NNNN", "VVVV",
    "as", "at", "back", "be", "became", "because", "become", "becomes",
    "becoming", "been", "being", "bill", "both", "bottom", "but", "by",
    "call", "can", "con", "could", "de", "do", "done", "eg", "etc", "even",
    "ever", "find", "for", "found", "from", "get", "give", "go", "had",
    "has", "have", "he", "her", "here", "hers", "herself", "him", "himself",
    "his", "how", "however", "if", "in", "inc", "into", "is", "it", "its",
    "itself", "keep",