Exemplo n.º 1
0
def build_mention_dict():
    """Build a gensim Dictionary over mention tokens from all data splits and save it.

    Streams the ``mentions.json.gz`` records of the training, validation and
    test splits, feeds each record's ``'mentions'`` token list to
    ``Dictionary.from_documents``, and persists the result as
    ``<LOCAL_ROOT>/mentions.dict``.
    """
    target_path = os.path.join(LOCAL_ROOT, 'mentions.dict')

    # Chain the three splits so the vocabulary covers every mention seen anywhere.
    all_records = itertools.chain(
        read_json_data('training', 'mentions.json.gz'),
        read_json_data('validation', 'mentions.json.gz'),
        read_json_data('test', 'mentions.json.gz'),
    )

    vocabulary = Dictionary.from_documents(
        record['mentions'] for record in all_records
    )
    vocabulary.save(target_path)
Exemplo n.º 2
0
def build_entity_dict():
    """Build a gensim Dictionary over outgoing entity ids from all data splits and save it.

    Streams the ``outgoing.json.gz`` records of the training, validation and
    test splits, feeds each record's ``'outgoing_entity_ids'`` list to
    ``Dictionary.from_documents``, and persists the result as
    ``<LOCAL_ROOT>/entities.dict``.
    """
    target_path = os.path.join(LOCAL_ROOT, 'entities.dict')

    # Chain the three splits so the vocabulary covers every entity id seen anywhere.
    all_records = itertools.chain(
        read_json_data('training', 'outgoing.json.gz'),
        read_json_data('validation', 'outgoing.json.gz'),
        read_json_data('test', 'outgoing.json.gz'),
    )

    vocabulary = Dictionary.from_documents(
        record['outgoing_entity_ids'] for record in all_records
    )
    vocabulary.save(target_path)
Exemplo n.º 3
0
def create_dct(df: pd.DataFrame,
               bigram: gensim.models.phrases.Phraser,
               trigram: gensim.models.phrases.Phraser,
               save: bool = False):
    """
    Create a gensim Dictionary from a dataframe of texts.

    Input:
    - df: dataframe with column "text"
    - bigram: bigram phraser
    - trigram: trigram phraser
    - save: if true, vocabulary is saved in files

    Returns:
    - dct: the filtered gensim Dictionary (previously the function built it
      but never returned it, making the ``save=False`` path useless)
    """
    def wrapper_phrase(rows):
        # Tokenize on single spaces, then merge bigram/trigram phrases into
        # single tokens by passing the token list through both phrasers.
        for row in rows:
            yield trigram[bigram[row.text.split(" ")]]

    dct = Dictionary.from_documents(wrapper_phrase(df.itertuples()))
    # Drop tokens in fewer than 1000 docs or more than 80% of docs; cap vocab at 150k.
    dct.filter_extremes(no_below=1000, no_above=0.80, keep_n=150000)
    if save:  # idiomatic truthiness check instead of `save == True`
        dct.save_as_text("./gensim_dct.txt")
        dct.save("./gensim_dct")
    return dct
Exemplo n.º 4
0
import pandas as pd

from gensim.models import TfidfModel
from gensim.corpora import Dictionary

from twip.constant import DATA_PATH

# Alias pandas' bundled numpy; pd.np was removed in pandas 1.0, so this
# transcript only runs on an old pandas version.
np = pd.np

# NOTE(review): `os` and `gzip` are used below but not imported in this
# snippet — presumably imported earlier in the original session.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'),
                    engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    # DataFrame.from_csv is long-deprecated (removed in pandas 1.0);
    # pd.read_csv is the replacement.
    df = pd.DataFrame.from_csv(f, encoding='utf8')
# This build fails (see the `# fail` marker below) — presumably because the
# loaded `tokens` column holds strings rather than token lists; the column is
# rebuilt by splitting `txt` further down.
d = Dictionary.from_documents(df.tokens)
# fail

df.tokens[0]
df.tokens
df.tokens.iloc[0]
# Rebuild `tokens` as real lists of words by whitespace-splitting the raw text.
df['tokens'] = df.txt.str.split()
df.tokens
df.tokens.iloc[0]
d = Dictionary.from_documents(df.txt.str.split())
len(d)
# The positional call passes the Dictionary where TfidfModel expects a
# bag-of-words corpus; the keyword form on the next line is the working one.
tfidf = TfidfModel(d)
tfidf = TfidfModel(dictionary=d)
tfidf
len(tfidf)
df.tokens[0]
Exemplo n.º 5
0
 def build_dict(docs):
     """Build and return a gensim Dictionary from the iterable of documents *docs*."""
     # Thin convenience wrapper around the gensim constructor.
     return Dictionary.from_documents(docs)
Exemplo n.º 6
0

# Load previously cleaned data

# In[6]:

# NOTE(review): `pd`, `os`, `gzip`, `DATA_PATH`, and `Dictionary` are used
# below but not imported in this snippet — presumably defined in earlier
# notebook cells.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    # DataFrame.from_csv is long-deprecated (removed in pandas 1.0);
    # pd.read_csv is the replacement.
    df = pd.DataFrame.from_csv(f, encoding='utf8')
df.tokens


# In[7]:

# Build a gensim vocabulary from the tokens column.
d = Dictionary.from_documents(df.tokens)


# In[11]:

df.tokens.iloc[0]


# When we said "QUOTE_NONNUMERIC" we didn't mean **ALL** nonnumeric fields ;)

# In[16]:

# Re-split the raw text so `tokens` holds lists of words rather than the
# quoted strings the CSV round-trip produced.
df['tokens'] = df.txt.str.split()
df.tokens

Exemplo n.º 7
0
import pandas as pd

from gensim.models import TfidfModel
from gensim.corpora import Dictionary

from twip.constant import DATA_PATH


# Alias pandas' bundled numpy; pd.np was removed in pandas 1.0, so this
# transcript only runs on an old pandas version.
np = pd.np

# NOTE(review): `os` and `gzip` are used below but not imported in this
# snippet — presumably imported earlier in the original session.
dates = pd.read_csv(os.path.join(DATA_PATH, 'datetimes.csv.gz'), engine='python')
nums = pd.read_csv(os.path.join(DATA_PATH, 'numbers.csv.gz'), engine='python')
with gzip.open(os.path.join(DATA_PATH, 'text.csv.gz'), 'rb') as f:
    # DataFrame.from_csv is long-deprecated (removed in pandas 1.0);
    # pd.read_csv is the replacement.
    df = pd.DataFrame.from_csv(f, encoding='utf8')
# This build fails (see the `# fail` marker below) — presumably because the
# loaded `tokens` column holds strings rather than token lists; the column is
# rebuilt by splitting `txt` further down.
d = Dictionary.from_documents(df.tokens)
# fail

df.tokens[0]
df.tokens
df.tokens.iloc[0]
# Rebuild `tokens` as real lists of words by whitespace-splitting the raw text.
df['tokens'] = df.txt.str.split()
df.tokens
df.tokens.iloc[0]
d = Dictionary.from_documents(df.txt.str.split())
len(d)
# The positional call passes the Dictionary where TfidfModel expects a
# bag-of-words corpus; the keyword form on the next line is the working one.
tfidf = TfidfModel(d)
tfidf = TfidfModel(dictionary=d)
tfidf
len(tfidf)
df.tokens[0]